import warnings
warnings.filterwarnings('ignore')
The first step in our analysis consists of an accurate examination of the platform streams in Italy using the Spotify top 200 weekly. The analysis is needed to gain insight into the events that cause the number of streams to increase or decrease.
At first, we will plot the number of streams for the different years: 2017, 2018, 2019, and 2020.
We wish to identify patterns and interesting trends, such as a decrease in the number of streams due to the spread of the pandemic, or event-related peaks.
The first dataframe extracted from our dataset and taken into consideration contains four attributes:
- avg_streams: the average number of streams computed on the weekly top 200, obtained by summing the number of streams of each track in the top 200 in each week and then dividing by the number of tracks (200)
- date: the start of the week
- year
- number_of_weeks: index of the week in a given year, since Spotify does not provide consistent week data.
import os
import pandas as pd
import matplotlib.dates as mdates
import matplotlib as plt
from matplotlib.dates import DateFormatter
from IPython.display import display, HTML
import datetime
# Build one row per weekly chart: the mean of its "Streams" column, the chart's
# start date, the year, and a running week index that restarts every year.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')
years = [2017, 2018, 2019, 2020]
list_data = []
# os.listdir() returns names in arbitrary order; sort them once so that the
# running week index follows the dates embedded in the file names.
csv_files = sorted(os.listdir(file_folder))
for year in years:
    number_of_weeks = 0
    for csv_file in csv_files:
        # The first 13 characters of a file name look like 'it_2017-01-06'.
        if str(year) in csv_file[:13]:
            short_name = csv_file[:13]
            csv_path = os.path.join(file_folder, csv_file)
            df_temp = pd.read_csv(csv_path)
            avg_streams = df_temp["Streams"].mean()
            day = short_name[-2:]
            month = short_name[8:10]
            date = f"{year}-{month}-{day}"
            # Never append to a DataFrame inside a loop: collect the rows in a
            # list and convert once at the end, see
            # https://stackoverflow.com/questions/10715965/add-one-row-to-pandas-dataframe
            list_data.append([avg_streams, date, year, number_of_weeks])
            number_of_weeks = number_of_weeks + 1
df = pd.DataFrame(list_data, columns=["avg_streams", "date", "year", "number of weeks"])
The resulting dataframe looks as follows:
# Parse the date strings, index the frame by date (the "date" column itself is
# kept) and order the rows chronologically.
df["date"] = pd.to_datetime(df["date"])
df = df.set_index(df["date"]).sort_index()
df
| avg_streams | date | year | number of weeks | |
|---|---|---|---|---|
| date | ||||
| 2017-01-06 | 145502.475 | 2017-01-06 | 2017 | 0 |
| 2017-01-13 | 158424.445 | 2017-01-13 | 2017 | 1 |
| 2017-01-20 | 166912.255 | 2017-01-20 | 2017 | 2 |
| 2017-01-27 | 169132.215 | 2017-01-27 | 2017 | 3 |
| 2017-02-03 | 181889.905 | 2017-02-03 | 2017 | 4 |
| ... | ... | ... | ... | ... |
| 2020-06-26 | 760505.945 | 2020-06-26 | 2020 | 25 |
| 2020-07-03 | 713891.695 | 2020-07-03 | 2020 | 26 |
| 2020-07-10 | 700764.875 | 2020-07-10 | 2020 | 27 |
| 2020-07-17 | 753817.945 | 2020-07-17 | 2020 | 28 |
| 2020-07-24 | 717685.125 | 2020-07-24 | 2020 | 29 |
186 rows × 4 columns
The results below show that the beginning of the pandemic and the consequent lockdown have caused a significant decrease in the number of streams, compared to the same period in previous years.
We can also notice a steady increase in streams that seems to reach a plateau between 2019 and 2020: the number of streams follows a constant upward trend during 2017, 2018 and 2019, and then settles down between 2019 and 2020. This trend is easily explained by Spotify's growing popularity and the increasing number of total users on the platform.
# Re-key every weekly row by (its date with the year replaced by 2018, its real
# year) so that the different years can be overlaid on one shared calendar axis.
idx = pd.MultiIndex.from_arrays([
pd.to_datetime(df.index.strftime('2018-%m-%d %H:%M:%S')),
df.index.year
])
# unstack() moves the year level into the columns; resample('W').mean() then
# averages every column over weekly bins.
d1 = df.set_index(idx).unstack().resample('W').mean()
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib import dates
import seaborn as sns
import numpy as np
# Overlay the weekly average streams of every year on one calendar axis and
# emphasise the last year together with the lockdown period.

# Set the seaborn style before creating the axes: style parameters are read
# when an Axes is created, so setting it afterwards would not style this figure.
sns.set_style('whitegrid')
fig, ax = plt.subplots(1, 1, figsize=(15, 9))
# One colour per year column of d1.avg_streams; the first three are muted so
# that the last one stands out.
flatui = ["#b0b6ff", "#9fd4c6", "#d3abff", "#EF553B"]
# Wide-form data: seaborn draws one line per column and colours it from
# `palette`, so no separate `color` argument is passed.
plot = sns.lineplot(ax=ax, data=d1.avg_streams, palette=flatui)
# Thicken the fourth line drawn on the axes (presumably 2020, the last year).
lines = ax.get_lines()
line = lines[3]
line.set(linewidth=5)
# d1 maps every year onto 2018, so the lockdown span is given in 2018 dates.
ax.axvspan(*mdates.datestr2num(['2018-03-09', '2018-05-18']), color='salmon', alpha=0.2, label="Italy lockdown")
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d'))
ax.tick_params(which='major', labelsize=13)
ax.tick_params(which="both", bottom=True)
ax.xaxis.set_minor_locator(mdates.DayLocator(interval=7))
ax.set_title(label='Streams trend', size='15')
# Built after axvspan so that the "Italy lockdown" entry is part of the legend.
ax.legend(loc="upper right")
ax.set_xlabel(xlabel='Date', size='15', style='italic')
ax.set_ylabel(ylabel='Streams', size='15', style='italic')
ax.set_xlim([datetime.date(2018, 1, 1), datetime.date(2018, 12, 31)])
fig.autofmt_xdate()
fig.tight_layout()
# Month-day labels derived from d1's index; reused as tick text further down.
ticktext = pd.to_datetime(d1.index.dayofyear, format='%j').to_series().dt.strftime('%m-%d').values
Here we insert an interactive plot that lets the user zoom in on interesting areas of the plot.
import plotly.graph_objects as go
# Interactive version of the streams trend: one trace per year, plotted against
# the ISO week number so that the years overlap.
df_resampled = df.resample('W').mean()
fig = go.Figure()
for year_label in ('2017', '2018', '2019', '2020'):
    # .loc with a year string selects that year's rows of the DatetimeIndex;
    # plain df['2017'] row selection is no longer supported by recent pandas.
    year_data = df_resampled.loc[year_label]
    # isocalendar().week replaces the removed DatetimeIndex.week attribute.
    week_numbers = year_data.index.isocalendar().week.to_numpy(dtype=int)
    fig.add_trace(go.Scatter(x=week_numbers, y=year_data.avg_streams, name=year_label,
                             hovertemplate=
                             '<b>Streams</b>: %{y:.2f}' +
                             '<br><i>Date</i>: %{x}<br>'))
# NOTE(review): ticktext is derived from d1's index while tickvals come from
# df_resampled; confirm that the two sequences line up as intended.
fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=df_resampled.index.isocalendar().week.to_numpy(dtype=int),
        ticktext=ticktext,
        tickangle=90
    )
)
The following heatmaps show the distribution of streams for each year. Each row is a week of the year (only the first 30 weeks are shown), while each column is a position in the top 200. We limited our analysis to the first 50 and then the first 15 positions for clarity, since plotting the whole top 200 resulted in an almost completely black heatmap.
#Heatmap for each year
import os
import pandas as pd
from IPython.display import display, HTML
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.colors import LogNorm
from scipy import stats
# Heatmaps of each track's share of the weekly streams, one panel per year,
# restricted to the first 30 weeks and the first 50 rows of every chart.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')
years = [2017, 2018, 2019, 2020]
# os.listdir() returns names in arbitrary order; sort them so that the heatmap
# rows follow the dates embedded in the file names.
csv_files = sorted(os.listdir(file_folder))
list_data_years = []
for year in years:
    number_of_weeks = 0
    list_data = []
    for csv_file in csv_files:
        if str(year) in csv_file[:13]:
            short_name = csv_file[:13]
            csv_path = os.path.join(file_folder, csv_file)
            df_temp = pd.read_csv(csv_path)
            array = df_temp['Streams'].values
            weekly_sum = df_temp['Streams'].sum()
            # Express every track as a percentage of that week's total streams.
            array = (array / weekly_sum) * 100
            list_data.append(array)
            number_of_weeks = number_of_weeks + 1
    list_data_years.append(list_data)

# Rows are weeks, columns are CSV rows (assumed to be in chart order - TODO confirm).
array_2017 = np.array(list_data_years[0])[:30, :50]
array_2018 = np.array(list_data_years[1])[:30, :50]
array_2019 = np.array(list_data_years[2])[:30, :50]
array_2020 = np.array(list_data_years[3])[:30, :50]

fig = plt.figure(figsize=(15, 15))
gs0 = gridspec.GridSpec(2, 2, figure=fig, hspace=0.2)
ax0 = fig.add_subplot(gs0[0, 0])
ax1 = fig.add_subplot(gs0[0, 1])
ax2 = fig.add_subplot(gs0[1, 0])
ax3 = fig.add_subplot(gs0[1, 1])
ax0.set_title(label='2017', fontsize='15')
ax1.set_title(label='2018', fontsize='15')
ax2.set_title(label='2019', fontsize='15')
ax3.set_title(label='2020', fontsize='15')
# A shared vmax keeps the colour scale comparable across the four panels.
sns.heatmap(array_2017, ax=ax0, vmax=8)
sns.heatmap(array_2018, ax=ax1, vmax=8)
sns.heatmap(array_2019, ax=ax2, vmax=8)
sns.heatmap(array_2020, ax=ax3, vmax=8)
<AxesSubplot:title={'center':'2020'}>
#Heatmap for each year
import os
import pandas as pd
from IPython.display import display, HTML
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.colors import LogNorm
from scipy import stats
# Heatmaps of each track's share of the weekly streams, one panel per year,
# restricted to the first 30 weeks and the first 15 rows of every chart.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')
years = [2017, 2018, 2019, 2020]
# os.listdir() returns names in arbitrary order; sort them so that the heatmap
# rows follow the dates embedded in the file names.
csv_files = sorted(os.listdir(file_folder))
list_data_years = []
for year in years:
    number_of_weeks = 0
    list_data = []
    for csv_file in csv_files:
        if str(year) in csv_file[:13]:
            short_name = csv_file[:13]
            csv_path = os.path.join(file_folder, csv_file)
            df_temp = pd.read_csv(csv_path)
            array = df_temp['Streams'].values
            weekly_sum = df_temp['Streams'].sum()
            # Express every track as a percentage of that week's total streams.
            array = (array / weekly_sum) * 100
            list_data.append(array)
            number_of_weeks = number_of_weeks + 1
    list_data_years.append(list_data)

# Rows are weeks, columns are CSV rows (assumed to be in chart order - TODO confirm).
array_2017 = np.array(list_data_years[0])[:30, :15]
array_2018 = np.array(list_data_years[1])[:30, :15]
array_2019 = np.array(list_data_years[2])[:30, :15]
array_2020 = np.array(list_data_years[3])[:30, :15]

fig = plt.figure(figsize=(15, 15))
gs0 = gridspec.GridSpec(2, 2, figure=fig, hspace=0.2)
ax0 = fig.add_subplot(gs0[0, 0])
ax1 = fig.add_subplot(gs0[0, 1])
ax2 = fig.add_subplot(gs0[1, 0])
ax3 = fig.add_subplot(gs0[1, 1])
ax0.set_title(label='2017', fontsize='15')
ax1.set_title(label='2018', fontsize='15')
ax2.set_title(label='2019', fontsize='15')
ax3.set_title(label='2020', fontsize='15')
# A shared vmax keeps the colour scale comparable across the four panels.
sns.heatmap(array_2017, ax=ax0, vmax=8)
sns.heatmap(array_2018, ax=ax1, vmax=8)
sns.heatmap(array_2019, ax=ax2, vmax=8)
sns.heatmap(array_2020, ax=ax3, vmax=8)
<AxesSubplot:title={'center':'2020'}>
The following analysis aims to highlight emerging trends in popular music by analyzing the correlation of certain musical features, such as energy, acousticness, loudness, and speechiness, with the platform's number of streams. The high correlation between streams and speechiness is a clear indication that Hip-Hop/Rap/Trap tracks are dominating Spotify's top 200.
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
from IPython.display import display, HTML
# Stack every weekly chart into one date-indexed frame, average it per week and
# keep only the columns used by the correlation plot.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')
years = [2017, 2018, 2019, 2020]
# Collect the weekly frames and concatenate once: calling pd.concat inside the
# loop copies the accumulated frame on every iteration.
weekly_frames = []
for year in years:
    for csv_file in os.listdir(file_folder):
        # Characters from position 15 onwards hold the chart's end date,
        # e.g. '2017-01-13.csv'.
        if str(year) in csv_file[15:]:
            short_name = csv_file[15:]
            csv_path = os.path.join(file_folder, csv_file)
            df_temp = pd.read_csv(csv_path)
            month = short_name[5:7]
            day = short_name[8:10]
            date = short_name[0:4] + '-' + month + '-' + day
            # Tag every row with the chart date and use it as the index.
            df_temp['date'] = pd.to_datetime(date)
            df_temp = df_temp.set_index('date')
            weekly_frames.append(df_temp)
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(weekly_frames) if weekly_frames else pd.DataFrame()
df.sort_index(inplace=True)
df_resampled = df.resample('W').mean()
df_relevant = df_resampled.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'index', 'Position', 'duration_ms'])
df_relevant = df_relevant.filter(items=['Streams', 'energy', 'acousticness', 'loudness', 'speechiness'])
from scipy.stats import pearsonr
def corrfunc(x, y, ax=None, **kws):
    """Annotate an axes with the Pearson correlation of ``x`` and ``y``.

    The text ``ρ = <r>`` (two decimals) is written near the top-left corner of
    the axes, positioned in the axes' own coordinate system.

    Args:
        x: First sample, passed to ``scipy.stats.pearsonr``.
        y: Second sample, passed to ``scipy.stats.pearsonr``.
        ax: Axes to annotate; the current pyplot axes are used when ``None``.
        **kws: Accepted and ignored, so the function can be used as a plotting
            callback that receives extra keyword arguments.
    """
    r, _ = pearsonr(x, y)
    # Test against None explicitly rather than relying on the truthiness of
    # the axes object.
    if ax is None:
        ax = plt.gca()
    # Unicode for lowercase rho (ρ)
    rho = '\u03C1'
    ax.annotate(f'{rho} = {r:.2f}', xy=(.1, .9), xycoords=ax.transAxes)
def corrdot(*args, **kwargs):
    """Draw a correlation "dot" for two series on the current pyplot axes.

    The first two positional arguments are the series to correlate (Pearson).
    The axes frame is hidden and a single centred marker is drawn whose area
    and colour depend on the coefficient; the coefficient itself is printed on
    top of it, with a font size that grows with its absolute value.

    Args:
        *args: ``args[0]`` must provide ``.corr``; ``args[1]`` is the other series.
        **kwargs: Accepted and ignored (extra arguments supplied by the caller).
    """
    first, second = args[0], args[1]
    corr_r = first.corr(second, 'pearson')
    strength = abs(corr_r)

    ax = plt.gca()
    ax.set_axis_off()

    # Marker centred in axes coordinates; area scales with |r|, colour with r.
    ax.scatter([.5], [.5], strength * 10000, [corr_r], alpha=0.6, cmap="Blues",
               vmin=-1, vmax=1, transform=ax.transAxes)

    # Drop the leading zero of the formatted coefficient.
    corr_text = f"{corr_r:2.2f}".replace("0.", ".")
    ax.annotate(corr_text, [.5, .5], xycoords="axes fraction",
                ha='center', va='center', fontsize=strength * 40 + 5)
# Pair grid over the columns of df_relevant: below the diagonal a regression
# fit annotated with the correlation coefficient, on the diagonal the
# distribution of each column, above the diagonal the correlation dots.
g = sns.PairGrid(df_relevant, aspect=1.4, diag_sharey=False)
g.map_lower(corrfunc)
g.map_lower(sns.regplot, lowess=True, ci=False, line_kws={'color': 'Black','linewidth':1})
g.map_diag(sns.distplot, kde_kws={'color': 'Black','linewidth':1})
g.map_upper(corrdot)
plt.show()
import os
import pandas as pd
from IPython.display import display, HTML
# Build one row per weekly chart: the mean of its "Streams" column, the chart's
# start date as 'MM-DD', the year, and a running week index that restarts
# every year.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')
years = [2017, 2018, 2019, 2020]
list_data = []
# os.listdir() returns names in arbitrary order; sort them once so that the
# running week index follows the dates embedded in the file names.
csv_files = sorted(os.listdir(file_folder))
for year in years:
    number_of_weeks = 0
    for csv_file in csv_files:
        # The first 13 characters of a file name look like 'it_2017-01-06'.
        if str(year) in csv_file[:13]:
            short_name = csv_file[:13]
            csv_path = os.path.join(file_folder, csv_file)
            df_temp = pd.read_csv(csv_path)
            avg_streams = df_temp["Streams"].mean()
            day = short_name[-2:]
            month = short_name[8:10]
            date = month + '-' + day
            # Never append to a DataFrame inside a loop: collect the rows in a
            # list and convert once at the end, see
            # https://stackoverflow.com/questions/10715965/add-one-row-to-pandas-dataframe
            list_data.append([avg_streams, date, year, number_of_weeks])
            number_of_weeks = number_of_weeks + 1
df = pd.DataFrame(list_data, columns=["avg streams", "date", "year", "number of weeks"])
There are some significant peaks in the avg_streams that we wish to analyze to unravel the causes of such spikes. Let us retrieve the top 200 of the weeks during which those peaks occur.
We retrieve the top 200 of these weeks and analyze it.
import os
import pandas as pd
# Read the weekly chart stored in 'it_2020-02-07--2020-02-14.csv'.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')
file = os.path.join(path, 'it_features', 'it_2020-02-07--2020-02-14.csv')
df_week = pd.read_csv(file)
In the following plot we're going to analyze the stream peak in the 7-14 February 2020 as highlighted on the stream trend plot (the upper one).
The barchart shows the number of songs in the top 200 per artist, while the treemap shows the percentage of streams per artist.
The bar chart shows a large number of tracks by Shiva, who released the Routine EP on 31 January 2020.
However, the barchart can be misleading.
There has indeed been a major release by Shiva (we will see later how releases impact streams), but in this case it is not the main reason for the increase in streams.
In fact, if we analyze the treemap, we can notice that there are many artists who have a good percentage of streams even if their number of tracks in the top 200 is less than or equal to three.
Such artists are Diodato, Marracash, Fasma, Francesco Gabbani and Elettra Lamborghini.
The plots also show a strong presence of tha Supreme, which is due to his major release in November 2019; that release guaranteed him a long presence in the top charts (we will analyze his case later).
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np
import matplotlib.gridspec as gridspec
import matplotlib
import squarify
# --- Per-artist summary of the week loaded in df_week -----------------------
# Number of chart entries per artist.
artist_entries = df_week['Artist'].value_counts().rename_axis('Artist').to_frame('Releases')
# Total streams per artist, largest first.
artist_grouped = df_week.groupby('Artist')['Streams'].sum()
artist_grouped = artist_grouped.sort_values(ascending=False)
total_streams = artist_grouped.sum()
result = pd.merge(artist_entries, artist_grouped, on=['Artist'])
# Turn each artist's streams into a percentage of the week's total.
result['Streams'] = (result['Streams'] / total_streams) * 100
result['Artist'] = result.index

# --- Figure 1: streams trend (top) and tracks per artist (bottom) ----------
fig = plt.figure(figsize=(15, 15))
gs0 = gridspec.GridSpec(2, 2, figure=fig, hspace=0.2)
with sns.axes_style("whitegrid"):
    ax1 = fig.add_subplot(gs0[0, :])
    ax2 = fig.add_subplot(gs0[1, :])

# Grey out every line except the one using the last palette entry.
flatui = ["#DCDCDC", "#DCDCDC", "#DCDCDC", "#EF553B"]
# Wide-form data: seaborn draws one line per column and colours it from
# `palette`, so no separate `color` argument is passed.
plot = sns.lineplot(ax=ax1, data=d1.avg_streams, palette=flatui)
lines = ax1.get_lines()
line = lines[3]
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d'))
ax1.tick_params(which="both", bottom=True)
ax1.xaxis.set_minor_locator(mdates.DayLocator(interval=7))
# Thicken the highlighted line and put a marker on its data point at index 5.
props = {'marker': 'o', 'markersize': 15, 'markeredgewidth': 1.5,
         'markeredgecolor': 'black', 'markevery': [5], 'zorder': 5, 'linewidth': 5}
line.set(**props)
ax1.set_title(label='Streams trend', fontsize='15')
ax1.set_xlabel(xlabel='Date', fontsize='15', style='italic')
ax1.set_ylabel(ylabel='Streams', fontsize='15', style='italic')
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.spines['left'].set_visible(False)
ax1.spines['bottom'].set_visible(False)
# d1 maps every year onto 2018, so the span and the limits use 2018 dates.
ax1.axvspan(*mdates.datestr2num(['2018-03-09', '2018-05-18']), color='salmon', alpha=0.2, label="Italy lockdown")
ax1.legend(loc="lower right")
# NOTE(review): the original marked this call as buggy without saying why -
# confirm that these are the intended limits.
ax1.set_xlim([datetime.date(2018, 1, 1), datetime.date(2018, 12, 31)])

sort_by_streams = result.sort_values('Streams', ascending=False)
# Bar chart of the first ten rows of `result`, shaded by their track count.
palette = matplotlib.cm.ScalarMappable(cmap='Blues').to_rgba(result.head(10)['Releases'])
sns.barplot(x="Artist", y="Releases", data=result.head(10), ax=ax2, palette=palette, saturation=1, edgecolor='grey')
ax2.set_xticklabels(
    ax2.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
    fontweight='light',
    fontsize='x-large'
)
ax2.yaxis.set_major_locator(ticker.MultipleLocator(1))
ax2.set_xlabel(xlabel='Artist', fontsize='15', style='italic')
ax2.set_ylabel(ylabel='# of tracks', fontsize='15', style='italic')
ax2.spines['right'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.spines['left'].set_visible(False)

# --- Figure 2: treemap of the stream share per artist -----------------------
# Keep only the artists holding more than 1.5% of the week's streams.
dataGoals = sort_by_streams[sort_by_streams["Streams"] > 1.5]
# Colour every tile by its share using the 'Blues' colormap.
colors = matplotlib.cm.ScalarMappable(cmap='Blues').to_rgba(dataGoals.Streams)
fig1 = plt.figure()
ax = fig1.add_subplot()
fig1.set_size_inches(16, 10)
# Tile labels: artist name and share; alpha keeps the black labels readable.
labels = ["%s\n%.2f%% " % pair for pair in zip(dataGoals.Artist, dataGoals.Streams)]
squarify.plot(label=labels, sizes=dataGoals.Streams, color=colors, alpha=.7, bar_kwargs=dict(linewidth=0.5, edgecolor="#222222"), text_kwargs={'fontsize': 13, 'wrap': True})
plt.title("Streams Percentage", fontsize=23, fontweight="bold")
# Remove the axes of the treemap and display both figures.
plt.axis('off')
plt.show()
The following result should make us ask: why are there some artists, like Diodato, who have quite a high percentage of streams even if they have three or fewer tracks in the top 200?
As we can see in the following table, the top 8 songs all come from the Sanremo Festival, with Diodato, the winner of Sanremo 2020, in first position.
df_week.filter(['Track Name','Artist','Streams', 'genre']).head(20)
| Track Name | Artist | Streams | genre | |
|---|---|---|---|---|
| 0 | Fai rumore | Diodato | 4268423 | Pop |
| 1 | Ringo Starr | Pinguini Tattici Nucleari | 3840527 | Pop |
| 2 | Me ne frego | Achille Lauro | 3309741 | Hip-Hop/Rap |
| 3 | Per sentirmi vivo | Fasma | 3192031 | Hip-Hop/Rap |
| 4 | Viceversa | Francesco Gabbani | 3144858 | Pop |
| 5 | Musica (E Il Resto Scompare) | Elettra Lamborghini | 3035842 | Urbano latino |
| 6 | Andromeda | Elodie | 2380992 | Hip-Hop/Rap |
| 7 | Tikibombom | Levante | 2215022 | Pop |
| 8 | blun7 a swishland | tha Supreme | 2010716 | Hip-Hop/Rap |
| 9 | Boogieman (feat. Salmo) | Ghali | 2001335 | Hip-Hop/Rap |
| 10 | Eden (feat. Dardust) | Rancore | 1892951 | Hip-Hop/Rap |
| 11 | Rapide | Mahmood | 1606138 | Pop |
| 12 | Calmo - feat. tha Supreme | Shiva | 1554386 | none |
| 13 | ANSIA NO | FSK SATELLITE | 1553983 | Hip-Hop/Rap |
| 14 | Rosso di rabbia | Anastasio | 1446845 | Hip-Hop/Rap |
| 15 | Dov'è | Le Vibrazioni | 1418915 | Pop |
| 16 | Dance Monkey | Tones And I | 1372288 | Alternative |
| 17 | Vai Bene Così | Leo Gassmann | 1305747 | Pop |
| 18 | fuck 3x | tha Supreme | 1292623 | Hip-Hop/Rap |
| 19 | Ti volevo dedicare (feat. J-AX & Boomdabash) | Rocco Hunt | 1276149 | Hip-Hop/Rap |
Now we're going to analyze the same week, but in 2019. We're analyzing the period 8-15 February 2019
import os
import pandas as pd
# Read the weekly chart stored in 'it_2019-02-08--2019-02-15.csv'.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')
file = os.path.join(path, 'it_features', 'it_2019-02-08--2019-02-15.csv')
df_week = pd.read_csv(file)
The bar chart shows a large number of tracks by Madman, who released the MM Vol.3 Mixtape on 1 February 2019.
However, if we look at the treemap containing the percentage of streams, we can see that Madman is only in third position, with Ultimo and Mahmood in first and second position respectively.
Mahmood won the Sanremo Festival 2019 while Ultimo ended up taking the second place.
There has also been a lot of discussion and criticism about Mahmood's first place. In fact, the public's favourite was Ultimo with 48.80% of the votes, while Mahmood got only 20.95% (source: https://www.sorrisi.com/musica/sanremo/sanremo-2019-analisi-del-televoto-e-dati-delle-giuri/).
The technical jury decided to assign the first place to Mahmood; this generated a lot of criticism and discussion in the media, which guaranteed Ultimo huge visibility and many streams on Spotify.
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np
import matplotlib.gridspec as gridspec
# --- Per-artist summary of the week loaded in df_week -----------------------
# Number of chart entries per artist.
artist_entries = df_week['Artist'].value_counts().rename_axis('Artist').to_frame('Releases')
# Total streams per artist, largest first.
artist_grouped = df_week.groupby('Artist')['Streams'].sum()
artist_grouped = artist_grouped.sort_values(ascending=False)
total_streams = artist_grouped.sum()
result = pd.merge(artist_entries, artist_grouped, on=['Artist'])
# Turn each artist's streams into a percentage of the week's total.
result['Streams'] = (result['Streams'] / total_streams) * 100
result['Artist'] = result.index

# --- Figure 1: streams trend (top) and tracks per artist (bottom) ----------
fig = plt.figure(figsize=(15, 15))
gs0 = gridspec.GridSpec(2, 2, figure=fig, hspace=0.2)
with sns.axes_style("whitegrid"):
    ax1 = fig.add_subplot(gs0[0, :])
    ax2 = fig.add_subplot(gs0[1, :])

# Grey out every line except the one using the third palette entry.
flatui = ["#DCDCDC", "#DCDCDC", "#AB63FA", "#DCDCDC"]
# Wide-form data: seaborn draws one line per column and colours it from
# `palette`, so no separate `color` argument is passed.
plot = sns.lineplot(ax=ax1, data=d1.avg_streams, palette=flatui)
lines = ax1.get_lines()
line = lines[2]
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d'))
ax1.tick_params(which="both", bottom=True)
ax1.xaxis.set_minor_locator(mdates.DayLocator(interval=7))
# Thicken the highlighted line and put a marker on its data point at index 5.
props = {'marker': 'o', 'markersize': 15, 'markeredgewidth': 1.5,
         'markeredgecolor': 'black', 'markevery': [5], 'zorder': 5, 'linewidth': 5}
line.set(**props)
ax1.set_title(label='Streams trend', fontsize='15')
ax1.set_xlabel(xlabel='Date', fontsize='15', style='italic')
ax1.set_ylabel(ylabel='Streams', fontsize='15', style='italic')
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.spines['left'].set_visible(False)
ax1.spines['bottom'].set_visible(False)
ax1.legend(loc="lower right")
# d1 maps every year onto 2018, so the limits are given in 2018 dates.
ax1.set_xlim([datetime.date(2018, 1, 1), datetime.date(2018, 12, 31)])

sort_by_streams = result.sort_values('Streams', ascending=False)
# Bar chart of the first ten rows of `result`, shaded by their track count.
palette = matplotlib.cm.ScalarMappable(cmap='Blues').to_rgba(result.head(10)['Releases'])
sns.barplot(x="Artist", y="Releases", data=result.head(10), ax=ax2, palette=palette, saturation=1, edgecolor='grey')
ax2.set_xticklabels(
    ax2.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
    fontweight='light',
    fontsize='x-large'
)
ax2.yaxis.set_major_locator(ticker.MultipleLocator(1))
ax2.set_xlabel(xlabel='Artist', fontsize='15', style='italic')
ax2.set_ylabel(ylabel='# of tracks', fontsize='15', style='italic')
ax2.spines['right'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.spines['left'].set_visible(False)

# --- Figure 2: treemap of the stream share per artist -----------------------
# Keep only the artists holding more than 1.5% of the week's streams.
dataGoals = sort_by_streams[sort_by_streams["Streams"] > 1.5]
# Scale the shares between their min and max, then map them onto 'Blues'.
norm = matplotlib.colors.Normalize(vmin=min(dataGoals.Streams), vmax=max(dataGoals.Streams))
colors = [matplotlib.cm.Blues(norm(value)) for value in dataGoals.Streams]
fig1 = plt.figure()
ax = fig1.add_subplot()
fig1.set_size_inches(16, 10)
# Tile labels: artist name and share; alpha keeps the black labels readable.
labels = ["%s\n%.2f%% " % pair for pair in zip(dataGoals.Artist, dataGoals.Streams)]
squarify.plot(label=labels, sizes=dataGoals.Streams, color=colors, alpha=.7, bar_kwargs=dict(linewidth=0.5, edgecolor="#222222"), text_kwargs={'fontsize': 13, 'wrap': True})
plt.title("Streams Percentage", fontsize=23, fontweight="bold")
# Remove the axes of the treemap and display both figures.
plt.axis('off')
plt.show()
If we plot the top 200 weekly dataset, we can notice that, out of the top 10 songs, 6 of them are from Sanremo Festival.
df_week.filter(['Track Name','Artist','Streams', 'genre']).head(10)
| Track Name | Artist | Streams | genre | |
|---|---|---|---|---|
| 0 | Soldi | Mahmood | 8223614 | Pop |
| 1 | I Tuoi Particolari | Ultimo | 5902025 | Pop |
| 2 | La ragazza con il cuore di latta | Irama | 4458947 | Pop |
| 3 | Senza farlo apposta | Shade | 2919684 | Hip-Hop/Rap |
| 4 | Rolls Royce (feat. Boss Doms & Frenetik&Orang3) | Achille Lauro | 2848988 | Hip-Hop/Rap |
| 5 | Per Un Milione | Boomdabash | 2456164 | Pop |
| 6 | È sempre bello | Coez | 2216325 | Pop |
| 7 | IL CIELO NELLA STANZA (feat. NSTASIA) | Salmo | 2001997 | Hip-Hop/Rap |
| 8 | Sweet but Psycho | Ava Max | 1884427 | Pop |
| 9 | Calma - Remix | Pedro Capó | 1859418 | Pop Latino |
It is indeed clearer how the streams spiking up in February in Italy are generally due to new releases and, more importantly, to Sanremo Festival. This analysis reveals the correlation between musical events (festivals, awards, etc.) and stream trends that will be further confirmed in subsequent analysis.
We can notice a huge spike in the number of streams in the week of 5-12 July 2019.
# Read the weekly chart stored in 'it_2019-07-05--2019-07-12.csv'.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')
file = os.path.join(path, 'it_features', 'it_2019-07-05--2019-07-12.csv')
df_week = pd.read_csv(file)
We can notice that there is a large number of tracks by MACHETE. In fact, the MACHETE Mixtape 4 was released on 5 July 2019 and was a huge success, featuring major collaborations between big rappers of the Italian music scene.
Also, more than 30% of the weekly streams come from MACHETE.
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np
import matplotlib.gridspec as gridspec
# --- Per-artist summary of the week loaded in df_week -----------------------
# Number of chart entries per artist.
artist_entries = df_week['Artist'].value_counts().rename_axis('Artist').to_frame('Releases')
# Total streams per artist, largest first.
artist_grouped = df_week.groupby('Artist')['Streams'].sum()
artist_grouped = artist_grouped.sort_values(ascending=False)
total_streams = artist_grouped.sum()
result = pd.merge(artist_entries, artist_grouped, on=['Artist'])
# Turn each artist's streams into a percentage of the week's total.
result['Streams'] = (result['Streams'] / total_streams) * 100
result['Artist'] = result.index

# --- Figure 1: streams trend (top) and tracks per artist (bottom) ----------
fig = plt.figure(figsize=(15, 15))
gs0 = gridspec.GridSpec(2, 2, figure=fig, hspace=0.2)
with sns.axes_style("whitegrid"):
    ax1 = fig.add_subplot(gs0[0, :])
    ax2 = fig.add_subplot(gs0[1, :])

# Grey out every line except the one using the third palette entry.
flatui = ["#DCDCDC", "#DCDCDC", "#AB63FA", "#DCDCDC"]
# Wide-form data: seaborn draws one line per column and colours it from
# `palette`, so no separate `color` argument is passed.
plot = sns.lineplot(ax=ax1, data=d1.avg_streams, palette=flatui)
lines = ax1.get_lines()
line = lines[2]
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d'))
ax1.tick_params(which="both", bottom=True)
ax1.xaxis.set_minor_locator(mdates.DayLocator(interval=7))
# Thicken the highlighted line and put a marker on its data point at index 26.
props = {'marker': 'o', 'markersize': 15, 'markeredgewidth': 1.5,
         'markeredgecolor': 'black', 'markevery': [26], 'zorder': 5, 'linewidth': 5}
line.set(**props)
ax1.set_title(label='Streams trend', fontsize='15')
ax1.set_xlabel(xlabel='# of weeks', fontsize='15', style='italic')
ax1.set_ylabel(ylabel='Streams', fontsize='15', style='italic')
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.spines['left'].set_visible(False)
ax1.spines['bottom'].set_visible(False)
ax1.legend(loc="lower right")
# d1 maps every year onto 2018, so the limits are given in 2018 dates.
ax1.set_xlim([datetime.date(2018, 1, 1), datetime.date(2018, 12, 31)])

sort_by_streams = result.sort_values('Streams', ascending=False)
# Bar chart of the first ten rows of `result`, shaded by their track count.
palette = matplotlib.cm.ScalarMappable(cmap='Blues').to_rgba(result.head(10)['Releases'])
sns.barplot(x="Artist", y="Releases", data=result.head(10), ax=ax2, palette=palette, saturation=1, edgecolor='grey')
ax2.set_xticklabels(
    ax2.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
    fontweight='light',
    fontsize='x-large'
)
ax2.yaxis.set_major_locator(ticker.MultipleLocator(2))
ax2.set_xlabel(xlabel='Artist', fontsize='15', style='italic')
ax2.set_ylabel(ylabel='# of tracks', fontsize='15', style='italic')
ax2.spines['right'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.spines['left'].set_visible(False)

# --- Figure 2: treemap of the stream share per artist -----------------------
# Keep only the artists holding more than 1.5% of the week's streams.
dataGoals = sort_by_streams[sort_by_streams["Streams"] > 1.5]
# Scale the shares between their min and max, then map them onto 'Blues'.
norm = matplotlib.colors.Normalize(vmin=min(dataGoals.Streams), vmax=max(dataGoals.Streams))
colors = [matplotlib.cm.Blues(norm(value)) for value in dataGoals.Streams]
fig1 = plt.figure()
ax = fig1.add_subplot()
fig1.set_size_inches(16, 10)
# Tile labels: artist name and share; alpha keeps the black labels readable.
labels = ["%s\n%.2f%% " % pair for pair in zip(dataGoals.Artist, dataGoals.Streams)]
squarify.plot(label=labels, sizes=dataGoals.Streams, color=colors, alpha=.7, bar_kwargs=dict(linewidth=0.5, edgecolor="#222222"), text_kwargs={'fontsize': 13, 'wrap': True})
plt.title("Streams Percentage", fontsize=23, fontweight="bold")
# Remove the axes of the treemap and display both figures.
plt.axis('off')
plt.show()
We can also notice that almost the whole top 10 comes from that album.
# Display the first ten rows of the currently loaded weekly chart, restricted
# to the track name, artist and stream count columns.
df_week.filter(['Track Name','Artist','Streams']).head(10)
| Track Name | Artist | Streams | |
|---|---|---|---|
| 0 | YOSHI (feat. Fabri Fibra) - prod. Strage | MACHETE | 5816357 |
| 1 | HO PAURA DI USCIRE 2 - prod. Mace | MACHETE | 5489578 |
| 2 | MARYLEAN (feat. Marracash) - prod. Low Kidd | MACHETE | 4829415 |
| 3 | STAR WARS - prod. Young Miles - Crookers x Nic... | MACHETE | 4098318 |
| 4 | MAMMASTOMALE (feat. Salmo) - prod. Dade | MACHETE | 3680698 |
| 5 | NO WAY (feat. Nitro) - prod. Low Kidd | MACHETE | 3555978 |
| 6 | IO PUO' (feat. Salmo) - prod. Low Kidd | MACHETE | 3094979 |
| 7 | Una volta ancora (feat. Ana Mena) | Fred De Palma | 3056237 |
| 8 | JAMBO | Takagi & Ketra | 3054732 |
| 9 | Dove e quando | Benji & Fede | 3008972 |
# Load the weekly chart stored in it_features/it_2019-11-15--2019-11-22.csv
# (resolved relative to the current working directory) into df_week.
path = os.getcwd()
file_folder = os.path.join(path,'it_features')
file = os.path.join(file_folder,'it_2019-11-15--2019-11-22.csv')
df_week=pd.read_csv(file)
In the left histogram we can notice 20 songs from Tha Supreme. He released his highly anticipated debut album 236451 on 15 November 2019, and it was a huge success.
Almost 35% of the weekly top 200 streams were generated by Tha Supreme, followed by Marracash, who had released his album Persona two weeks earlier (31 October 2019).
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np
import matplotlib.gridspec as gridspec
# ---------------------------------------------------------------------------
# Per-artist summary of the loaded weekly chart:
#   Releases -> number of chart rows credited to the artist
#   Streams  -> artist's share (in %) of all streams in the chart
# ---------------------------------------------------------------------------
artist_entries = (
    df_week['Artist']
    .value_counts()
    .rename_axis('Artist')
    .to_frame('Releases')
)
artist_grouped = df_week.groupby('Artist')['Streams'].sum().sort_values(ascending=False)
total_streams = artist_grouped.sum()

result = pd.merge(artist_entries, artist_grouped, on=['Artist'])
result['Streams'] = (result['Streams'] / total_streams) * 100
result['Artist'] = result.index

# Two stacked panels: stream trend on top, tracks-per-artist bars below.
fig = plt.figure(figsize=(15, 15))
gs0 = gridspec.GridSpec(2, 2, figure=fig, hspace=0.2)
with sns.axes_style("whitegrid"):
    ax1 = fig.add_subplot(gs0[0, :])
    ax2 = fig.add_subplot(gs0[1, :])

# --- Top panel -------------------------------------------------------------
# Every series is drawn in grey except the third palette entry, so a single
# line stands out.
flatui = ["#DCDCDC", "#DCDCDC", "#AB63FA", "#DCDCDC"]
plot = sns.lineplot(ax=ax1, data=d1.avg_streams, color='years', palette=flatui)
lines = ax1.get_lines()
line = lines[2]

ax1.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d'))
ax1.tick_params(which="both", bottom=True)
ax1.xaxis.set_minor_locator(mdates.DayLocator(interval=7))

# Thicken the highlighted line and put a single marker on its point 45.
# TODO: use plt.text() to add a label next to the highlighted point.
props = dict(
    marker='o',
    markersize=15,
    markeredgewidth=1.5,
    markeredgecolor='black',
    markevery=[45],
    zorder=5,
    linewidth=5,
)
line.set(**props)

ax1.set_title(label='Streams trend', fontsize='15')
ax1.set_xlabel(xlabel='# of weeks', fontsize='15', style='italic')
ax1.set_ylabel(ylabel='Streams', fontsize='15', style='italic')
for _side in ('right', 'top', 'left', 'bottom'):
    ax1.spines[_side].set_visible(False)
ax1.legend(loc="lower right")
ax1.set_xlim([datetime.date(2018, 1, 1), datetime.date(2018, 12, 31)])

# --- Bottom panel ----------------------------------------------------------
# Bars for the first ten rows of `result`, shaded by their track count.
sort_by_streams = result.sort_values('Streams', ascending=False)
palette = matplotlib.cm.ScalarMappable(cmap='Blues').to_rgba(result.head(10)['Releases'])
sns.barplot(
    x="Artist",
    y="Releases",
    data=result.head(10),
    ax=ax2,
    palette=palette,
    saturation=1,
    edgecolor='grey',
)
ax2.set_xticklabels(
    ax2.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
    fontweight='light',
    fontsize='x-large',
)
ax2.yaxis.set_major_locator(ticker.MultipleLocator(2))
ax2.set_xlabel(xlabel='Artist', fontsize='15', style='italic')
ax2.set_ylabel(ylabel='# of tracks', fontsize='15', style='italic')
for _side in ('right', 'top', 'left'):
    ax2.spines[_side].set_visible(False)

# --- Treemap (separate figure) ---------------------------------------------
# Keep only artists holding more than 1.5% of the chart's streams.
dataGoals = sort_by_streams[sort_by_streams["Streams"] > 1.5]

# Map each share onto the Blues colormap, scaled between the smallest and
# largest share that passed the filter.
norm = matplotlib.colors.Normalize(vmin=min(dataGoals.Streams), vmax=max(dataGoals.Streams))
colors = [matplotlib.cm.Blues(norm(value)) for value in dataGoals.Streams]

fig1 = plt.figure()
ax = fig1.add_subplot()
fig1.set_size_inches(16, 10)

# One tile per artist, labelled "<artist>\n<share>%"; alpha keeps the black
# labels readable on dark tiles.
labels = [
    "%s\n%.2f%% " % (artist, share)
    for artist, share in zip(dataGoals.Artist, dataGoals.Streams)
]
squarify.plot(
    label=labels,
    sizes=dataGoals.Streams,
    color=colors,
    alpha=.7,
    bar_kwargs=dict(linewidth=0.5, edgecolor="#222222"),
    text_kwargs={'fontsize': 13, 'wrap': True},
)
plt.title("Streams Percentage", fontsize=23, fontweight="bold")

# Hide the treemap axes and render both figures.
plt.axis('off')
plt.show()
As we can see, basically the whole top 10 is by Tha Supreme
# Display the first ten rows of the currently loaded weekly chart, restricted
# to the track name, artist and stream count columns.
df_week.filter(['Track Name','Artist','Streams']).head(10)
| Track Name | Artist | Streams | |
|---|---|---|---|
| 0 | blun7 a swishland | tha Supreme | 7429539 |
| 1 | fuck 3x | tha Supreme | 4668644 |
| 2 | sw1n6o - feat. Salmo | tha Supreme | 3954844 |
| 3 | m12ano - feat. Mara Sattei | tha Supreme | 3555240 |
| 4 | SUPREME - L'ego (feat. tha Supreme & Sfera Ebb... | Marracash | 3381296 |
| 5 | occh1 purpl3 - feat. Marracash | tha Supreme | 3329403 |
| 6 | no14 - feat. Dani Faiv | tha Supreme | 3081602 |
| 7 | come fa1 | tha Supreme | 3019409 |
| 8 | ch1 5ei te | tha Supreme | 3012034 |
| 9 | 2ollipop | tha Supreme | 2901181 |
# Load the weekly chart stored in it_features/it_2018-11-09--2018-11-16.csv
# (resolved relative to the current working directory) into df_week.
path = os.getcwd()
file_folder = os.path.join(path,'it_features')
file = os.path.join(file_folder,'it_2018-11-09--2018-11-16.csv')
df_week=pd.read_csv(file)
Fifteen tracks by Salmo are present in the top 200: his album Playlist was released on 9 November 2018.
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np
import matplotlib.gridspec as gridspec
# ---------------------------------------------------------------------------
# Per-artist summary of the loaded weekly chart:
#   Releases -> number of chart rows credited to the artist
#   Streams  -> artist's share (in %) of all streams in the chart
# ---------------------------------------------------------------------------
artist_entries = (
    df_week['Artist']
    .value_counts()
    .rename_axis('Artist')
    .to_frame('Releases')
)
artist_grouped = df_week.groupby('Artist')['Streams'].sum().sort_values(ascending=False)
total_streams = artist_grouped.sum()

result = pd.merge(artist_entries, artist_grouped, on=['Artist'])
result['Streams'] = (result['Streams'] / total_streams) * 100
result['Artist'] = result.index

# Two stacked panels: stream trend on top, tracks-per-artist bars below.
fig = plt.figure(figsize=(15, 15))
gs0 = gridspec.GridSpec(2, 2, figure=fig, hspace=0.2)
with sns.axes_style("whitegrid"):
    ax1 = fig.add_subplot(gs0[0, :])
    ax2 = fig.add_subplot(gs0[1, :])

# --- Top panel -------------------------------------------------------------
# Every series is drawn in grey except the second palette entry, so a single
# line stands out.
flatui = ["#DCDCDC", "#00CC96", "#DCDCDC", "#DCDCDC"]
plot = sns.lineplot(ax=ax1, data=d1.avg_streams, color='years', palette=flatui)
lines = ax1.get_lines()
line = lines[1]

ax1.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d'))
ax1.tick_params(which="both", bottom=True)
ax1.xaxis.set_minor_locator(mdates.DayLocator(interval=7))

# Thicken the highlighted line and put a single marker on its point 44.
# TODO: use plt.text() to add a label next to the highlighted point.
props = dict(
    marker='o',
    markersize=15,
    markeredgewidth=1.5,
    markeredgecolor='black',
    markevery=[44],
    linewidth=5,
    zorder=5,
)
line.set(**props)

ax1.set_title(label='Streams trend', fontsize='15')
ax1.set_xlabel(xlabel='# of weeks', fontsize='15', style='italic')
ax1.set_ylabel(ylabel='Streams', fontsize='15', style='italic')
for _side in ('right', 'top', 'left', 'bottom'):
    ax1.spines[_side].set_visible(False)
ax1.legend(loc="lower right")
ax1.set_xlim([datetime.date(2018, 1, 1), datetime.date(2018, 12, 31)])

# --- Bottom panel ----------------------------------------------------------
# Bars for the first ten rows of `result`, shaded by their track count.
sort_by_streams = result.sort_values('Streams', ascending=False)
palette = matplotlib.cm.ScalarMappable(cmap='Blues').to_rgba(result.head(10)['Releases'])
sns.barplot(
    x="Artist",
    y="Releases",
    data=result.head(10),
    ax=ax2,
    palette=palette,
    saturation=1,
    edgecolor='grey',
)
ax2.set_xticklabels(
    ax2.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
    fontweight='light',
    fontsize='x-large',
)
ax2.yaxis.set_major_locator(ticker.MultipleLocator(1))
ax2.set_xlabel(xlabel='Artist', fontsize='15', style='italic')
ax2.set_ylabel(ylabel='# of tracks', fontsize='15', style='italic')
for _side in ('right', 'top', 'left'):
    ax2.spines[_side].set_visible(False)

# --- Treemap (separate figure) ---------------------------------------------
# Keep only artists holding more than 1.5% of the chart's streams.
dataGoals = sort_by_streams[sort_by_streams["Streams"] > 1.5]

# Map each share onto the Blues colormap, scaled between the smallest and
# largest share that passed the filter.
norm = matplotlib.colors.Normalize(vmin=min(dataGoals.Streams), vmax=max(dataGoals.Streams))
colors = [matplotlib.cm.Blues(norm(value)) for value in dataGoals.Streams]

fig1 = plt.figure()
ax = fig1.add_subplot()
fig1.set_size_inches(16, 10)

# One tile per artist, labelled "<artist>\n<share>%"; alpha keeps the black
# labels readable on dark tiles.
labels = [
    "%s\n%.2f%% " % (artist, share)
    for artist, share in zip(dataGoals.Artist, dataGoals.Streams)
]
squarify.plot(
    label=labels,
    sizes=dataGoals.Streams,
    color=colors,
    alpha=.7,
    bar_kwargs=dict(linewidth=0.5, edgecolor="#222222"),
    text_kwargs={'fontsize': 15, 'wrap': True},
)
plt.title("Streams Percentage", fontsize=23, fontweight="bold")

# Hide the treemap axes and render both figures.
plt.axis('off')
plt.show()
We can easily see that almost the whole top 10 is taken by Salmo.
# Display the first ten rows of the currently loaded weekly chart, restricted
# to the track name, artist and stream count columns.
df_week.filter(['Track Name','Artist','Streams']).head(10)
| Track Name | Artist | Streams | |
|---|---|---|---|
| 0 | CABRIOLET (feat. Sfera Ebbasta) | Salmo | 5434298 |
| 1 | STAI ZITTO (feat. Fabri Fibra) | Salmo | 5195708 |
| 2 | IL CIELO NELLA STANZA (feat. NSTASIA) | Salmo | 5163645 |
| 3 | LUNEDI' | Salmo | 3859383 |
| 4 | HO PAURA DI USCIRE | Salmo | 3493788 |
| 5 | SPARARE ALLA LUNA (feat. Coez) | Salmo | 3475966 |
| 6 | RICCHI E MORTI | Salmo | 3390260 |
| 7 | 90MIN | Salmo | 3216286 |
| 8 | DISPOVERY CHANNEL (feat. Nitro) | Salmo | 3118643 |
| 9 | Torna a casa | Måneskin | 2797833 |
# Load the weekly chart stored in it_features/it_2018-01-19--2018-01-26.csv
# (resolved relative to the current working directory) into df_week.
path = os.getcwd()
file_folder = os.path.join(path,'it_features')
file = os.path.join(file_folder,'it_2018-01-19--2018-01-26.csv')
df_week=pd.read_csv(file)
We can easily spot 18 tracks by Sfera Ebbasta. He released his album Rockstar on 19 January 2018, and it was a huge success in the Italian trap scene.
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np
import matplotlib.gridspec as gridspec
# ---------------------------------------------------------------------------
# Per-artist summary of the loaded weekly chart:
#   Releases -> number of chart rows credited to the artist
#   Streams  -> artist's share (in %) of all streams in the chart
# ---------------------------------------------------------------------------
artist_entries = (
    df_week['Artist']
    .value_counts()
    .rename_axis('Artist')
    .to_frame('Releases')
)
artist_grouped = df_week.groupby('Artist')['Streams'].sum().sort_values(ascending=False)
total_streams = artist_grouped.sum()

result = pd.merge(artist_entries, artist_grouped, on=['Artist'])
result['Streams'] = (result['Streams'] / total_streams) * 100
result['Artist'] = result.index

# Two stacked panels: stream trend on top, tracks-per-artist bars below.
fig = plt.figure(figsize=(15, 15))
gs0 = gridspec.GridSpec(2, 2, figure=fig, hspace=0.2)
with sns.axes_style("whitegrid"):
    ax1 = fig.add_subplot(gs0[0, :])
    ax2 = fig.add_subplot(gs0[1, :])

# --- Top panel -------------------------------------------------------------
# Every series is drawn in grey except the second palette entry, so a single
# line stands out.
flatui = ["#DCDCDC", "#00CC96", "#DCDCDC", "#DCDCDC"]
plot = sns.lineplot(ax=ax1, data=d1.avg_streams, color='years', palette=flatui)
lines = ax1.get_lines()
line = lines[1]

ax1.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d'))
ax1.tick_params(which="both", bottom=True)
ax1.xaxis.set_minor_locator(mdates.DayLocator(interval=7))

# Thicken the highlighted line and put a single marker on its point 2.
# TODO: use plt.text() to add a label next to the highlighted point.
props = dict(
    marker='o',
    markersize=15,
    markeredgewidth=1.5,
    markeredgecolor='black',
    markevery=[2],
    linewidth=5,
    zorder=5,
)
line.set(**props)

ax1.set_title(label='Streams trend', fontsize='15')
ax1.set_xlabel(xlabel='# of weeks', fontsize='15', style='italic')
ax1.set_ylabel(ylabel='Streams', fontsize='15', style='italic')
for _side in ('right', 'top', 'left', 'bottom'):
    ax1.spines[_side].set_visible(False)
ax1.legend(loc="lower right")
ax1.set_xlim([datetime.date(2018, 1, 1), datetime.date(2018, 12, 31)])

# --- Bottom panel ----------------------------------------------------------
# Bars for the first ten rows of `result`, shaded by their track count.
sort_by_streams = result.sort_values('Streams', ascending=False)
palette = matplotlib.cm.ScalarMappable(cmap='Blues').to_rgba(result.head(10)['Releases'])
sns.barplot(
    x="Artist",
    y="Releases",
    data=result.head(10),
    ax=ax2,
    palette=palette,
    saturation=1,
    edgecolor='grey',
)
ax2.set_xticklabels(
    ax2.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
    fontweight='light',
    fontsize='x-large',
)
ax2.yaxis.set_major_locator(ticker.MultipleLocator(2))
ax2.set_xlabel(xlabel='Artist', fontsize='15', style='italic')
ax2.set_ylabel(ylabel='# of tracks', fontsize='15', style='italic')
for _side in ('right', 'top', 'left'):
    ax2.spines[_side].set_visible(False)

# --- Treemap (separate figure) ---------------------------------------------
# Keep only artists holding more than 1.5% of the chart's streams.
dataGoals = sort_by_streams[sort_by_streams["Streams"] > 1.5]

# Map each share onto the Blues colormap, scaled between the smallest and
# largest share that passed the filter.
norm = matplotlib.colors.Normalize(vmin=min(dataGoals.Streams), vmax=max(dataGoals.Streams))
colors = [matplotlib.cm.Blues(norm(value)) for value in dataGoals.Streams]

fig1 = plt.figure()
ax = fig1.add_subplot()
fig1.set_size_inches(16, 10)

# One tile per artist, labelled "<artist>\n<share>%"; alpha keeps the black
# labels readable on dark tiles.
labels = [
    "%s\n%.2f%% " % (artist, share)
    for artist, share in zip(dataGoals.Artist, dataGoals.Streams)
]
squarify.plot(
    label=labels,
    sizes=dataGoals.Streams,
    color=colors,
    alpha=.7,
    bar_kwargs=dict(linewidth=0.5, edgecolor="#222222"),
    text_kwargs={'fontsize': 15, 'wrap': True},
)
plt.title("Streams Percentage", fontsize=23, fontweight="bold")

# Hide the treemap axes and render both figures.
plt.axis('off')
plt.show()
The whole top 10 is taken by Sfera Ebbasta.
# Display the first ten rows of the currently loaded weekly chart, restricted
# to the track name, artist and stream count columns.
df_week.filter(['Track Name','Artist','Streams']).head(10)
| Track Name | Artist | Streams | |
|---|---|---|---|
| 0 | Cupido (feat. Quavo) | Sfera Ebbasta | 5105410 |
| 1 | Rockstar | Sfera Ebbasta | 4722530 |
| 2 | Sciroppo (feat. DrefGold) | Sfera Ebbasta | 4156428 |
| 3 | Ricchi x Sempre | Sfera Ebbasta | 3552712 |
| 4 | Serpenti A Sonagli | Sfera Ebbasta | 3381143 |
| 5 | Uber | Sfera Ebbasta | 2894123 |
| 6 | XNX | Sfera Ebbasta | 2552249 |
| 7 | Leggenda | Sfera Ebbasta | 2216057 |
| 8 | 20 Collane | Sfera Ebbasta | 2071546 |
| 9 | Bancomat | Sfera Ebbasta | 2051597 |
To conclude, we can observe a correlation between the number of streams on Spotify and major releases and musical events.
This correlation becomes more evident than before in the last two years, with Spotify becoming a widespread platform used daily by its users. In 2017 and the beginning of 2018, the service's popularity was still growing, as we can see by the constant increase in the number of streams.
Limited to Italy, most spikes were given by Italian Hip-Hop/Rap/Trap artists, as we have already anticipated with our correlation plot, which showed a high correlation between speechiness and streams.
These charts reported the analysis only of a limited number of peaks for sake of clarity. The analysis has been performed on all peaks in the plot.
We also identified a decreasing trend in the number of streams during quarantine/pandemic.
Considering the previous results, a natural question arises: "Is this decreasing trend caused directly by the pandemic, or is it caused by the lack of new releases and musical events?"
We will analyze the number of releases in the pandemic period and compare it with the number of releases in the same period in previous years.
Now we're going to analyze the number of new releases during the quarantine period related to Hip-Hop/Rap genre.
import os
import pandas as pd
from IPython.display import display, HTML
import seaborn as sns
import matplotlib.pyplot as plt
# Build one dataframe holding the audio features of every weekly chart in
# it_features/, tagged with the chart's start date ("MM-DD"), its year and a
# 0-based week index within that year; then keep only the Hip-Hop/Rap rows.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')
years = [2017, 2018, 2019, 2020]

feature_columns = ["danceability", "energy", "loudness", "tempo", "valence",
                   "acousticness", "release date", "genre"]
output_columns = ["danceability", "energy", "loudness", "tempo", "valence",
                  "acousticness", "date", "year", "number_of_weeks",
                  "release date", "genre"]

# os.listdir() returns names in arbitrary order. The file names start with
# "it_YYYY-MM-DD", so sorting them makes number_of_weeks chronological, which
# the later "weeks 9..19" selection depends on.
chart_files = sorted(os.listdir(file_folder))

# Collect the weekly frames and concatenate once: concatenating inside the
# loop re-copies all previously loaded rows on every iteration.
frames = []
for year in years:
    number_of_weeks = 0
    for csv_file in chart_files:
        short_name = csv_file[:13]  # "it_" prefix plus the week's start date
        if str(year) not in short_name:
            continue
        csv_path = os.path.join(file_folder, csv_file)
        df_temp = pd.read_csv(csv_path).filter(feature_columns)
        df_temp['date'] = short_name[8:10] + '-' + short_name[-2:]  # "MM-DD"
        df_temp['year'] = year
        df_temp['number_of_weeks'] = number_of_weeks
        frames.append(df_temp)
        number_of_weeks += 1

if frames:
    # reindex() restores the declared column order and adds any column that
    # was missing from every CSV as all-NaN.
    df = pd.concat(frames).reindex(columns=output_columns)
else:
    df = pd.DataFrame(columns=output_columns)

df = df[df["genre"] == 'Hip-Hop/Rap']
df_trend_releases = df.filter(items=['year', 'number_of_weeks', 'date', 'release date', 'genre'])
# Builds `data`: one [date, year, week, number_of_releases] row per chart week,
# where number_of_releases counts the charted tracks whose release date falls
# inside that chart week.
import datetime

df_trend_releases = df.filter(items=['year', 'number_of_weeks', 'date', 'release date', 'genre'])
date_format = "%Y-%m-%d"
years = [2017, 2018, 2019, 2020]


def _count_new_releases(release_dates, week_start):
    """Count the entries of release_dates lying in [week_start, week_start + 7 days).

    Values that are not full "YYYY-MM-DD" strings are skipped. This includes
    missing values: strptime raises TypeError (not ValueError) for a float NaN,
    so both exceptions are handled.
    """
    week_end = week_start + datetime.timedelta(days=7)
    count = 0
    for raw in release_dates:
        try:
            released = datetime.datetime.strptime(raw, date_format).date()
        except (TypeError, ValueError):
            continue
        if week_start <= released < week_end:
            count += 1
    return count


# The week bounds live inside the helper, so this cell no longer overwrites
# notebook-level names such as `d1`, which the plotting cells read as a
# dataframe.
data = []
for year in years:
    current_year = df_trend_releases.loc[df_trend_releases['year'] == year]
    for week in current_year['number_of_weeks'].unique():
        current_week = current_year.loc[current_year['number_of_weeks'] == week]
        week_label = current_week['date'].iloc[0]  # "MM-DD" start of the week
        week_start = datetime.datetime.strptime(
            str(year) + '-' + week_label, date_format
        ).date()
        number_of_releases = _count_new_releases(current_week['release date'], week_start)
        data.append([week_label, year, week, number_of_releases])
In the following table we have the total number of releases for each year in the period from the 6th of March to the 15th of May, which spans the entire lockdown period in Italy. (sources= http://www.salute.gov.it/portale/nuovocoronavirus/dettaglioNotizieNuovoCoronavirus.jsp?lingua=italiano&menu=notizie&p=dalministero&id=4184 and https://www.rainews.it/dl/rainews/articoli/coronavirus-Fase-2-ecco-come-si-riparte-il-18-maggio-6a0c52c3-2f99-4170-95de-e6d221e7ff21.html )
# One row per chart week; keep week indices 9..19 (both inclusive), the window
# used in this notebook for the 6 March - 15 May period.
releases_dataframe = pd.DataFrame(data, columns=['date', 'year', 'week', 'number of releases'])
releases_dataframe = releases_dataframe[releases_dataframe['week'].between(9, 19)]
releases_dataframe = releases_dataframe.drop(columns=['week'])

# Sum only the numeric column. A bare groupby('year').sum() also "sums" the
# string `date` column, which pandas >= 2.0 concatenates into one long string
# per year instead of dropping it.
releases_dataframe.groupby('year')[['number of releases']].sum()
| number of releases | |
|---|---|
| year | |
| 2017 | 78 |
| 2018 | 118 |
| 2019 | 117 |
| 2020 | 55 |
The following plot shows the number of new releases in the period from 03-06 to 05-15 for each year.
As we can see in the plot, we have an increasing trend in the number of releases in the different years which drops in 2020.
import matplotlib.ticker as ticker
# Line chart of the yearly total of new releases: the weekly rows of each year
# are aggregated with `sum`, and no confidence band is drawn.
flatui = ["#636EFA", "#00CC96", "#AB63FA", "#EF553B"]
fig, ax = plt.subplots(figsize=(15, 7))
ax = sns.lineplot(
    data=releases_dataframe,
    x="year",
    y="number of releases",
    estimator=sum,
    ci=None,
    marker='o',
    markersize=13,
    linewidth=2.5,
)
# One tick per year, so no fractional years appear on the x axis.
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.set_xlabel(xlabel='Year', fontsize=15)
ax.set_ylabel(ylabel='Number of new releases', fontsize=15)
ax.set_title(label='New releases from 03-06 to 05-15 per year', fontsize=15)
Text(0.5, 1.0, 'New releases from 03-06 to 05-15 per year')
By looking at the plot above, we can notice a drastic decrease in the number of releases in 2020. As discussed previously, most of the significant peaks in the number of streams are related to Italian rappers' influential album releases.
We first build a dataframe which contains the number of new releases per artist in the period from 03-06 to 05-15 for each year. For example we will end up with something like:
| Artist | Year | New |
| Queen | 1972 | 15 |
| Queen | 1980 | 12 |
| Michael Jackson | 1979 | 10 |
This means that Queen released 15 new tracks in the period from 03-06-1972 to 05-15-1972, they also released 12 new tracks in the period from 03-06-1980 to 05-15-1980 and Michael Jackson released 10 new tracks in the period 03-06-1979 to 05-15-1979 and so on.
# Selecting the lockdown period and the plotting the artists with a high number of releases in that period
import os
import pandas as pd
from IPython.display import display, HTML
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
# Build `new_df`: for every (artist, year), the number of Hip-Hop/Rap chart
# entries in week indices 9..19 whose release date falls inside the chart week
# in which they appear.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')
years = [2017, 2018, 2019, 2020]
date_format = "%Y-%m-%d"

track_columns = ["Artist", "Track Name", "release date", "genre"]
output_columns = ["Artist", "Track Name", "date", "year", "number_of_weeks",
                  "release date", "new", "genre"]


def _released_within(raw_date, week_start, week_end):
    """Return 1 if raw_date is a "YYYY-MM-DD" date in [week_start, week_end), else 0.

    Unparseable strings and missing values count as "not new": strptime raises
    TypeError (not ValueError) for a float NaN, so both exceptions are handled.
    """
    try:
        released = datetime.datetime.strptime(raw_date, date_format).date()
    except (TypeError, ValueError):
        return 0
    return int(week_start <= released < week_end)


# os.listdir() returns names in arbitrary order. The file names start with
# "it_YYYY-MM-DD", so sorting them makes number_of_weeks chronological, which
# the week-index window below depends on.
chart_files = sorted(os.listdir(file_folder))

# Collect the weekly frames and concatenate once instead of inside the loop.
frames = []
for year in years:
    number_of_weeks = 0
    for csv_file in chart_files:
        short_name = csv_file[:13]  # "it_" prefix plus the week's start date
        if str(year) not in short_name:
            continue
        csv_path = os.path.join(file_folder, csv_file)
        df_temp = pd.read_csv(csv_path).filter(track_columns)

        # The chart week spans seven days from the start date in the file name.
        week_start = datetime.datetime.strptime(csv_file[3:13], date_format).date()
        week_end = week_start + datetime.timedelta(days=7)
        df_temp['new'] = [
            _released_within(raw, week_start, week_end)
            for raw in df_temp['release date']
        ]
        df_temp['date'] = short_name[8:10] + '-' + short_name[-2:]  # "MM-DD"
        df_temp['year'] = year
        df_temp['number_of_weeks'] = number_of_weeks
        frames.append(df_temp)
        number_of_weeks += 1

if frames:
    df = pd.concat(frames).reindex(columns=output_columns)
else:
    df = pd.DataFrame(columns=output_columns)

# Restrict to the week-index window (both bounds inclusive) and to Hip-Hop/Rap.
lower_bound = 9
upper_bound = 19
df = df[(df['number_of_weeks'] >= lower_bound) & (df['number_of_weeks'] <= upper_bound)]
df = df[df['genre'] == 'Hip-Hop/Rap']

# Sum only the `new` flag; summing every column would also aggregate the
# string columns (track names, dates, genre).
new_df = df.groupby(['Artist', 'year'], sort=True)['new'].sum().reset_index()
new_df = new_df.sort_values(by=['new'], ascending=[False])
new_df = new_df.filter(items=['Artist', 'year', 'new'])
As we can see, there were considerably more Italian releases in the considered period in 2017, 2018, and 2019 than in 2020, which had only one release, by Nitro, right at the beginning of the quarantine period: he released his album GarbAge on 6 March 2020.
The most acute observer could notice something peculiar: we have a decent number of new releases from Ghali and Marracash in the period from 03-06-2020 to 05-15-2020, but Ghali released his album DNA on the 21st of February 2020 while Marracash released Persona on the 31st October 2019.
Someone might wonder why there are releases by artists that do not correspond to the release of new albums and are neither new singles. We will analyze this particular situation in the following.
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np
import matplotlib.gridspec as gridspec
import matplotlib
# Maybe some of the labels could be removed.
year_2017 = new_df.loc[new_df['year'] == 2017]
year_2018 = new_df.loc[new_df['year'] == 2018]
year_2019 = new_df.loc[new_df['year'] == 2019]
year_2020 = new_df.loc[new_df['year'] == 2020]

# 2x2 grid, one panel per year.
fig = plt.figure(figsize=(15, 12))
gs0 = gridspec.GridSpec(2, 2, figure=fig, hspace=0.7)
with sns.axes_style("whitegrid"):
    ax1 = fig.add_subplot(gs0[0, 0])
    ax2 = fig.add_subplot(gs0[0, 1])
    ax3 = fig.add_subplot(gs0[1, 0])
    ax4 = fig.add_subplot(gs0[1, 1])


def _plot_top_new_releases(axis, yearly, cmap, title):
    """Draw the first ten rows of `yearly` as bars of `new` per artist on `axis`.

    Bars are shaded with `cmap` according to their `new` value. All panels
    share the same 0-20 y range, so they can be compared by eye. Returns
    whatever `axis.set(ylim=...)` returns.
    """
    first_ten = yearly.head(10)
    shades = matplotlib.cm.ScalarMappable(cmap=cmap).to_rgba(first_ten['new'])
    sns.barplot(
        x="Artist",
        y="new",
        data=first_ten,
        ax=axis,
        palette=shades,
        saturation=1,
        edgecolor='grey',
    )
    axis.set_xticklabels(
        axis.get_xticklabels(),
        rotation=45,
        horizontalalignment='right',
        fontweight='light',
        fontsize='x-large',
    )
    axis.yaxis.set_major_locator(ticker.MultipleLocator(2))
    axis.set_xlabel('Artist', fontsize=15, style='italic', color='#4f4e4e')
    axis.set_ylabel('Number of new releases', fontsize=15, style='italic', color='#4f4e4e')
    axis.set_title(title, fontsize=15, fontweight='bold', color='#4f4e4e')
    for side in ('right', 'top', 'left'):
        axis.spines[side].set_visible(False)
    return axis.set(ylim=(0, 20))


_plot_top_new_releases(ax1, year_2017, 'Blues', '03-06-2017 to 05-15-2017')
_plot_top_new_releases(ax2, year_2018, 'Purples', '03-06-2018 to 05-15-2018')
_plot_top_new_releases(ax3, year_2019, 'Greens', '03-06-2019 to 05-15-2019')
_plot_top_new_releases(ax4, year_2020, 'Reds', '03-06-2020 to 05-15-2020')
[(0.0, 20.0)]
If we analyze the file it_2020-03-27--2020-04-03, we can notice 6 new releases by Marracash!
We retrieve also the top 200 corresponding to the week of the release of the album, which is the 1st November 2019. In the second dataframe, we can notice 15 tracks from Marracash in the top 200, which basically are all the tracks from the album Persona.
The tracks are the following:
We can clearly notice that all the tracks contained in the first dataframe, except for SPORT + muscoli (RMX) are also present in the second dataframe, but they have different release dates and different IDs!
Why? We're going to discover this later.
First things first: we pick a track that is present in both dataframes and check whether it is actually the same track by querying the Spotify API. We're going to use SUPREME - L'ego.
import os
import pandas as pd
# Show Marracash's entries in two weekly charts, flagging with new=1 the
# tracks whose release date falls inside the chart week.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')


def _artist_tracks_with_new_flag(file_name, artist):
    """Load one weekly chart and return `artist`'s rows with a 0/1 `new` column.

    The chart week is taken to be the seven days starting at the date embedded
    in the file name ("it_YYYY-MM-DD--..."); that start date is printed.
    Release dates that are missing or not full "YYYY-MM-DD" strings are
    flagged 0: strptime raises TypeError for a float NaN and ValueError for a
    malformed string, so both are handled.
    """
    chart = pd.read_csv(os.path.join(file_folder, file_name))
    chart = chart.filter(items=['Position', 'Track Name', 'Artist', 'ID', 'release date'])

    lower_date = file_name[3:13]
    print(lower_date)
    week_start = datetime.datetime.strptime(lower_date, '%Y-%m-%d').date()
    week_end = week_start + datetime.timedelta(days=7)

    flags = []
    for raw in chart['release date']:
        try:
            released = datetime.datetime.strptime(raw, '%Y-%m-%d').date()
        except (TypeError, ValueError):
            flags.append(0)
            continue
        flags.append(1 if week_start <= released < week_end else 0)
    chart['new'] = flags

    return chart[chart['Artist'] == artist]


# First the week of the re-release, then the week of the original album.
for file_name in ('it_2020-03-27--2020-04-03.csv', 'it_2019-11-01--2019-11-08.csv'):
    df = _artist_tracks_with_new_flag(file_name, 'Marracash')
    display(df)
2020-03-27
| Position | Track Name | Artist | ID | release date | new | |
|---|---|---|---|---|---|---|
| 2 | 3 | SPORT + muscoli (RMX) (feat. Lazza, Paky, Luch... | Marracash | 7srDOLMwjMC07Dzs7mqCnv | 2020-03-27 | 1 |
| 22 | 23 | BRAVI A CADERE - I polmoni | Marracash | 3me7fQdjgO8NHscb3xPaBa | 2020-03-27 | 1 |
| 28 | 29 | SUPREME - L'ego (feat. tha Supreme & Sfera Ebb... | Marracash | 5EpBEqf5bnKeJ3p4zA1Sod | 2020-03-27 | 1 |
| 33 | 34 | NEON - Le Ali (feat. Elisa) | Marracash | 76cEI7LczlKZ1yLAHIoPBJ | 2020-03-27 | 1 |
| 35 | 36 | CRUDELIA - I nervi | Marracash | 5dsz7MTrNdN9aMTrnp7sOG | 2020-03-27 | 1 |
| 95 | 96 | MADAME - L'anima (feat. Madame) | Marracash | 3go5ZuMoFTKl5ugai8nAKi | 2020-03-27 | 1 |
2019-11-01
| Position | Track Name | Artist | ID | release date | new | |
|---|---|---|---|---|---|---|
| 0 | 1 | SUPREME - L'ego (feat. tha Supreme & Sfera Ebb... | Marracash | 020wG4EHTwBy5h2rbpxqvK | 2019-10-31 | 0 |
| 1 | 2 | CRUDELIA - I nervi | Marracash | 1OeIJITrlUR4qss2kywMEn | 2019-10-31 | 0 |
| 3 | 4 | APPARTENGO - Il sangue (feat. Massimo Pericolo) | Marracash | 2tpfOljBdskx8G331TjRps | 2019-10-31 | 0 |
| 4 | 5 | QUELLI CHE NON PENSANO - Il cervello (feat. Coez) | Marracash | 6l1Jnqfu7GAOV4MDgA2bNP | 2019-10-31 | 0 |
| 5 | 6 | QUALCOSA IN CUI CREDERE - Lo scheletro (feat. ... | Marracash | 39hRcGieqFaKfZ7LN91Vy5 | 2019-10-31 | 0 |
| 6 | 7 | MADAME - L'anima (feat. Madame) | Marracash | 2wL96n9ToLGhekNTFrPhxj | 2019-10-31 | 0 |
| 8 | 9 | BRAVI A CADERE - I polmoni | Marracash | 4SpV49wwekArulAVjEez3i | 2019-10-31 | 0 |
| 9 | 10 | SPORT - I muscoli (feat. Luchè) | Marracash | 3HVm4g9bZHo0aMsjzGS7Ro | 2019-10-31 | 0 |
| 10 | 11 | BODY PARTS - I denti | Marracash | 4WOXQYfq3fYVRMcaXuHBAD | 2019-10-31 | 0 |
| 11 | 12 | NON SONO MARRA - La pelle (feat. Mahmood) | Marracash | 0pl6P6vTEB9pNABaD53SgS | 2019-10-31 | 0 |
| 13 | 14 | G.O.A.T. - Il cuore | Marracash | 50c3vq2uUkSV8QrcbcPWW3 | 2019-10-31 | 0 |
| 15 | 16 | POCO DI BUONO - Il fegato | Marracash | 4ykwdc532KqgY4z8CJcUwO | 2019-10-31 | 0 |
| 20 | 21 | DA BUTTARE - Il ca**o | Marracash | 2auOwPOpAxhGmJfIaxS0kQ | 2019-10-31 | 0 |
| 21 | 22 | TUTTO QUESTO NIENTE - Gli occhi | Marracash | 5Lwin5QMDUiC5W39iJ6nsi | 2019-10-31 | 0 |
| 24 | 25 | GRETA THUNBERG - Lo stomaco (feat. Cosmo) | Marracash | 38VKvSaq4ic5k5zCHshjMo | 2019-10-31 | 0 |
We can notice that the album is the same, and also all the artist infos are exactly the same! But there are 2 major differences:
From the previous points and dataframes, we can conclude that 2 tracks have been added later which are SPORT + muscoli (RMX) and NEON - Le Ali, which are present in the first dataframe, but not in the official track list of the album Persona.
We discovered that adding extra tracks (remixes, collaborations, featurings, etc.) to an album automatically updates the release date of all the tracks in the album!
In fact, the following link points to the album Persona with a 2019 release date: https://open.spotify.com/album/19iZTn6IM82raMquk5Z7Ul
while this link: https://open.spotify.com/album/3ZOt77e63uMgJXU7xcFpqu points to the same album, but with a 2020 release date.
It is worth noticing that the same track from the 2 album versions has the same identical number of streams, this could give some hints about spotify music storage and format. It seems that the track is actually the same, but it has different ID and release date, and it's present in two different versions of the same album.
The track we're using as test is SUPREME - L'ego.
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import os
import requests
import json
import time
import pprint
# Compare the album metadata Spotify returns for two IDs of the same track.
#
# The client id/secret are read from the environment instead of being written
# in the notebook, so the secret is never published together with the analysis.
# NOTE(review): the secret that used to be hard-coded here should be revoked.
credentials = SpotifyClientCredentials(
    client_id=os.environ["SPOTIPY_CLIENT_ID"],
    client_secret=os.environ["SPOTIPY_CLIENT_SECRET"],
)
# Hand the credentials manager to the client so that spotipy requests (and
# refreshes) the access token itself.
spotify = spotipy.Spotify(client_credentials_manager=credentials)


def _print_album_summary(header, track):
    """Print the artists, name, release date and size of the album of `track`.

    `track` is the dictionary returned by `spotify.track(...)`.
    """
    album = track['album']
    print(header)
    print('Artist infos:')
    pprint.pprint(album['artists'])
    pprint.pprint('Album name: ' + album['name'])
    pprint.pprint('Release date: ' + album['release_date'])
    pprint.pprint('Album number of tracks: ' + str(album['total_tracks']))


first_dataframe_track = spotify.track('5EpBEqf5bnKeJ3p4zA1Sod')
_print_album_summary('-------------First result-------------', first_dataframe_track)
second_dataframe_track = spotify.track('020wG4EHTwBy5h2rbpxqvK')
_print_album_summary('------------Second result-------------', second_dataframe_track)
-------------First result-------------
Artist infos:
[{'external_urls': {'spotify': 'https://open.spotify.com/artist/5AZuEF0feCXMkUCwQiQlW7'},
'href': 'https://api.spotify.com/v1/artists/5AZuEF0feCXMkUCwQiQlW7',
'id': '5AZuEF0feCXMkUCwQiQlW7',
'name': 'Marracash',
'type': 'artist',
'uri': 'spotify:artist:5AZuEF0feCXMkUCwQiQlW7'}]
'Album name: Persona'
'Release date: 2020-03-27'
'Album number of tracks: 17'
------------Second result-------------
Artist infos:
[{'external_urls': {'spotify': 'https://open.spotify.com/artist/5AZuEF0feCXMkUCwQiQlW7'},
'href': 'https://api.spotify.com/v1/artists/5AZuEF0feCXMkUCwQiQlW7',
'id': '5AZuEF0feCXMkUCwQiQlW7',
'name': 'Marracash',
'type': 'artist',
'uri': 'spotify:artist:5AZuEF0feCXMkUCwQiQlW7'}]
'Album name: Persona'
'Release date: 2019-10-31'
'Album number of tracks: 15'
The previous considerations hold also for Ghali. He added two new songs to his DNA album released on the 21st February 2020, which are Cacao and Hasta la vista.
Everything we said so far gives us two important hints:
That said, Ghali and Marracash released only 2 and 3 new tracks in that period, which are remixes and/or featurings.
The only major album release by an Italian artist has been GarbAge by Nitro. We can conclude that the pandemic had a relevant impact on the work of artists, as we had already thought, and that the lack of new releases had an impact on the average number of streams on Spotify.
Remarks: Our conclusions are based on the weekly top 200, thus we can conclude neither that the whole Spotify platform had a lower number of active users/streams, nor that the overall number of releases by artists has decreased. We can only conclude that in the mainstream/commercial scene there haven't been new releases by famous artists.
It could be interesting to filter and analyze only italian releases and compare their impact on the Spotify charts with respect to international releases, but unfortunately Spotify doesn't give any information about the language or release nation of the songs.
It could also be interesting to investigate further whether there have been significant changes overall on the platform, for example a major change in the genres listened to, but, again, we don't have access to the needed data.
The following article shows how people were more keen to listen to lo-fi chill music during the quarantine if compared to the "normality": https://blog.chartmetric.com/covid-19-effect-on-the-global-music-business-part-1-genre/
Unfortunately we can't show/prove these changes since our data is limited to the top 200, mainstream music hasn't seen major changes in the most listened genres.
In the following cells we're going to plot the streams trend for different nations worldwide. We decided to restrict our analysis only to 6 nations due to the high amount of time needed to download the data from the internet (due to rate download limitations of the provided APIs).
In particular we focused on major EU countries + USA and Brazil, focusing on countries that faced lockdown measures against COVID.
#avg streams during quarantine per state. Used in heatmap
import os
import pandas as pd
from IPython.display import display, HTML
# Build one row per (country, weekly chart) holding the mean number of streams
# of that chart, then keep a fixed window of weeks of 2020.
path = os.getcwd()
years = [2020]
states = ['it', 'es', 'fr', 'de', 'br', 'us']
list_data = []
for state in states:
    # Each country has its own '<code>_features' folder next to the notebook.
    file_folder = os.path.join(path, state + '_features')
    for year in years:
        number_of_weeks = 0
        # os.listdir() returns entries in arbitrary order. The week counter
        # below is later used to select a window of weeks, so the files are
        # sorted by name first (the first 13 characters are sliced as
        # '<cc>_YYYY-MM-DD', so name order presumably is date order).
        for csv_file in sorted(os.listdir(file_folder)):
            short_name = csv_file[:13]
            if str(year) not in short_name:
                continue
            df_temp = pd.read_csv(os.path.join(file_folder, csv_file))
            avg_streams = df_temp["Streams"].mean()
            day = short_name[-2:]
            month = short_name[8:10]
            date = str(year) + '-' + month + '-' + day
            # Rows are collected in a list and converted once: appending to a
            # DataFrame inside a loop is slow.
            list_data.append([avg_streams, date, year, number_of_weeks, state])
            number_of_weeks = number_of_weeks + 1
df = pd.DataFrame(list_data, columns=["avg_streams", "date", "year", "number_of_weeks", "state"])
lower_bound = 4
upper_bound = 30
# .copy() so that the assignments below modify an independent frame rather
# than a slice of the unfiltered one.
df = df[(df['number_of_weeks'] >= lower_bound) & (df['number_of_weeks'] <= upper_bound)].copy()
df.date = pd.to_datetime(df.date)
df.set_index(df.date, inplace=True)
df.sort_index(inplace=True)
We applied a Z-score normalization to the data of each nation. Data among nations was heterogeneous: comparing streams on an absolute scale produced unreadable plots, since the USA had an average number of streams 10 times higher than other countries.
Therefore we normalize our data to make it comparable.
# Z-score normalisation, country by country: every value of 'avg_streams' is
# replaced by (value - mean) / standard deviation, where both statistics are
# computed over the rows of the same country only.
states = df['state'].unique()
for state in states:
    in_state = df['state'] == state
    streams = df.loc[in_state, 'avg_streams']
    centred = streams - streams.mean()
    df.loc[in_state, 'avg_streams'] = centred / streams.std()
The plots show a common decreasing trend around 15 weeks among all states with an increasing trend towards summer.
# Plot the lines on two facets
import seaborn as sns
from matplotlib.dates import DayLocator, HourLocator, DateFormatter, drange
# One facet per country: the country of the facet is drawn as a thick coloured
# line, every other country is drawn on the same axes in light grey.
palette = sns.color_palette("tab10", 6)
g1 = sns.relplot(
    data=df,
    x="date",
    y="avg_streams",
    hue='state',
    col="state",
    kind="line",
    palette=palette,
    height=5,
    aspect=1,
    facet_kws={'sharex': False},
    col_wrap=3,
    linewidth=5,
    zorder=5,
)
axes = g1.fig.axes
states = df.state.unique()
for index, state in enumerate(states):
    facet = axes[index]
    # Every country except the one highlighted in this facet.
    df_temp = df.loc[df['state'] != state]
    palette1 = dict.fromkeys(df_temp.state.unique(), 'lightgrey')
    sns.lineplot(ax=facet, x=df_temp.index, y="avg_streams", hue="state", data=df_temp, palette=palette1)
    facet.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d'))
    facet.tick_params(which="both", bottom=True)
    facet.xaxis.set_minor_locator(mdates.DayLocator(interval=7))
    facet.get_legend().remove()
    # The first three facets get no x-axis label.
    if index < 3:
        facet.set_xlabel('')
The following charts show Spotify's musical features over time and the pandemic's influence on them. As previously mentioned, those results refer only to the top 50 because many songs remained in the top 200 for many weeks, so no absolute conclusions concerning global music trends can be drawn. However, we could still infer some critical information on the most popular songs and see if there has been any shift in the users' preferences in the period under study.
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
from IPython.display import display, HTML
import datetime
# Load every weekly chart of the Italian folder into a single frame indexed by
# the date found in the file name, then compute the weekly mean of the numeric
# columns over the top-50 tracks and overlay the years on one calendar year.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')
years = [2017, 2018, 2019, 2020]
weekly_frames = []
for year in years:
    for csv_file in os.listdir(file_folder):
        # The date is read from character 15 of the file name onwards
        # (sliced below as 'YYYY-MM-DD').
        short_name = csv_file[15:]
        if str(year) not in short_name:
            continue
        df_temp = pd.read_csv(os.path.join(file_folder, csv_file))
        date = short_name[0:4] + '-' + short_name[5:7] + '-' + short_name[8:10]
        df_temp['date'] = pd.to_datetime(date)
        # set_index('date') moves the column into the index in one step.
        weekly_frames.append(df_temp.set_index('date'))
# Concatenate once: calling pd.concat inside the loop copies the accumulated
# frame for every file.
df = pd.concat(weekly_frames)
sns.set_style("whitegrid")
df.sort_index(inplace=True)
df['year'] = df.index.year
# Keep the tracks ranked in the top 50 (Position 1..50). The comparison used
# to be '>= 50', which selected the ranks from 50 downwards instead.
top50 = df[df['Position'] <= 50]
# Only numeric columns can be averaged; text columns are left out explicitly.
top50 = top50.select_dtypes(include='number').resample('W').mean()
# Re-base every date on the year 2020 (a leap year, so any month/day is valid)
# and keep the real year as a second index level, so that unstack() yields one
# column per (feature, year) on a shared calendar axis.
idx = pd.MultiIndex.from_arrays([
    pd.to_datetime(top50.index.strftime('2020-%m-%d %H:%M:%S')),
    top50.index.year
])
d1 = top50.set_index(idx).unstack().resample('W').mean()
Features distributions don't change significantly across years.
The only relevant change is in speechiness: it has a higher maximum value in 2020 if compared to previous years.
# Violin plots of five audio features, one violin per year, restricted to the
# tracks with Position <= 50 whose date falls in months 1 to 7.
top50 = df[(df.index.month < 8) & (df['Position'] <= 50)]
fig, axs = plt.subplots(2, 3)
fig.set_size_inches(18.5, 10.5)
fig.suptitle('Features between January and August', fontsize=20)
# Six cells for five features: the bottom-right cell stays empty.
axs[-1, -1].axis('off')
flatui = ["#636EFA", "#00CC96", "#AB63FA", "#EF553B"]


def _violin(cell, feature):
    """Draw the per-year violin plot of `feature` in grid cell `cell`."""
    return sns.violinplot(ax=axs[cell], x=top50.index.year, y=feature, data=top50, palette=flatui)


energy = _violin((0, 0), "energy")
valence = _violin((0, 1), 'valence')
acoustic = _violin((0, 2), 'acousticness')
speech = _violin((1, 0), 'speechiness')
dance = _violin((1, 1), 'danceability')
for panel, title in (
    (energy, 'ENERGY'),
    (valence, 'VALENCE'),
    (acoustic, 'ACOUSTICNESS'),
    (speech, 'SPEECHINESS'),
    (dance, 'DANCEABILITY'),
):
    panel.set_title(title)
for panel in (energy, valence, acoustic, speech):
    panel.set_xlabel('Year')
dance.set_xlabel('Year')
Text(0.5, 0, 'Year')
As defined by Spotify, Valence measures the musical positiveness conveyed by a track, using a score between 0.0 and 1.0.
This parameter undergoes some fluctuations over the year and shows some periodicity. In particular, one can notice that every year the average valence increases around Christmas time.
The steep increase during the lockdown
period seems to contrast the periodicity and could be attributed to the tendency to look for comforting and cheerful music in
that period of social isolation. However, considering the significant variance of this feature, combined with the stability of the top 50 in terms of musical genres, may indicate that this abrupt growth happened by chance.
# Weekly valence, one line per column of d1.valence, drawn on a shared
# calendar axis with the Italian lockdown period shaded.
fig, ax = plt.subplots(1, 1, figsize=(15, 9))
# Muted colours for the first three lines, a saturated red for the fourth.
flatui = ["#b0b6ff", "#9fd4c6", "#d3abff", "#EF553B"]
sns.set_style('whitegrid')
# Line colours come from `palette`. No `color` keyword is passed: the value
# used before ('year') is not a matplotlib colour specification.
plot = sns.lineplot(ax=ax, data=d1.valence, palette=flatui)
# Thicken the fourth line drawn (presumably the most recent year - verify
# against the column order of d1.valence).
lines = ax.get_lines()
line = lines[3]
line.set(linewidth=5)
ax.axvspan(*mdates.datestr2num(['2020-03-09', '2020-05-18']), color='salmon', alpha=0.2, label="Italy lockdown")
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d'))
ax.tick_params(which="both", bottom=True)
ax.xaxis.set_minor_locator(mdates.DayLocator(interval=7))
ax.set(title='VALENCE')
# Single legend call, made after axvspan so that the shaded span is listed.
ax.legend(loc="upper right")
ax.set_xlim([datetime.date(2020, 1, 1), datetime.date(2020, 12, 31)])
fig.autofmt_xdate()
fig.tight_layout()
Danceability describes how suitable a track is for dancing based on a combination of musical elements, including tempo, rhythm, and overall regularity. Danceability increased in 2018 and since then has been almost constant, without showing particularly relevant variations even during the lockdown period.
# Weekly danceability, one line per column of d1.danceability, drawn on a
# shared calendar axis with the Italian lockdown period shaded.
fig, ax = plt.subplots(1, 1, figsize=(15, 9))
# Muted colours for the first three lines, a saturated red for the fourth.
flatui = ["#b0b6ff", "#9fd4c6", "#d3abff", "#EF553B"]
sns.set_style('whitegrid')
# Line colours come from `palette`. No `color` keyword is passed: the value
# used before ('year') is not a matplotlib colour specification.
plot = sns.lineplot(ax=ax, data=d1.danceability, palette=flatui)
# Thicken the fourth line drawn (presumably the most recent year - verify
# against the column order of d1.danceability).
lines = ax.get_lines()
line = lines[3]
line.set(linewidth=5)
ax.axvspan(*mdates.datestr2num(['2020-03-09', '2020-05-18']), color='salmon', alpha=0.2, label="Italy lockdown")
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d'))
ax.tick_params(which="both", bottom=True)
ax.xaxis.set_minor_locator(mdates.DayLocator(interval=7))
ax.set(title='DANCEABILITY')
# Single legend call, made after axvspan so that the shaded span is listed.
ax.legend(loc="upper right")
ax.set_xlim([datetime.date(2020, 1, 1), datetime.date(2020, 12, 31)])
ax.set_ylim(0.6, 0.75)
fig.autofmt_xdate()
fig.tight_layout()
Energy measures the overall intensity and activity of a track, as a function of its speed, loudness, and noisiness.
# Weekly energy, one line per column of d1.energy, drawn on a shared calendar
# axis with the Italian lockdown period shaded.
fig, ax = plt.subplots(1, 1, figsize=(15, 9))
# Muted colours for the first three lines, a saturated red for the fourth.
flatui = ["#b0b6ff", "#9fd4c6", "#d3abff", "#EF553B"]
sns.set_style('whitegrid')
# Line colours come from `palette`. No `color` keyword is passed: the value
# used before ('year') is not a matplotlib colour specification.
plot = sns.lineplot(ax=ax, data=d1.energy, palette=flatui)
# Thicken the fourth line drawn (presumably the most recent year - verify
# against the column order of d1.energy).
lines = ax.get_lines()
line = lines[3]
line.set(linewidth=5)
ax.axvspan(*mdates.datestr2num(['2020-03-09', '2020-05-18']), color='salmon', alpha=0.2, label="Italy lockdown")
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d'))
ax.tick_params(which="both", bottom=True)
ax.xaxis.set_minor_locator(mdates.DayLocator(interval=7))
ax.set(title='ENERGY')
# Single legend call, made after axvspan so that the shaded span is listed.
ax.legend(loc="upper right")
ax.set_xlim([datetime.date(2020, 1, 1), datetime.date(2020, 12, 31)])
fig.autofmt_xdate()
fig.tight_layout()
Acousticness, as the name suggests, measures the presence of acoustic instruments in a track.
It is possible to observe a significant drop during the lockdown period. Nevertheless, considering the complete history of this feature, this trend shows some periodicity, indicating that it may not be due to the pandemic.
# Weekly acousticness, one line per column of d1.acousticness, drawn on a
# shared calendar axis with the Italian lockdown period shaded.
fig, ax = plt.subplots(1, 1, figsize=(15, 9))
# Muted colours for the first three lines, a saturated red for the fourth.
flatui = ["#b0b6ff", "#9fd4c6", "#d3abff", "#EF553B"]
sns.set_style('whitegrid')
# Line colours come from `palette`. No `color` keyword is passed: the value
# used before ('year') is not a matplotlib colour specification.
plot = sns.lineplot(ax=ax, data=d1.acousticness, palette=flatui)
# Thicken the fourth line drawn (presumably the most recent year - verify
# against the column order of d1.acousticness).
lines = ax.get_lines()
line = lines[3]
line.set(linewidth=5)
ax.axvspan(*mdates.datestr2num(['2020-03-09', '2020-05-18']), color='salmon', alpha=0.2, label="Italy lockdown")
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d'))
ax.tick_params(which="both", bottom=True)
ax.xaxis.set_minor_locator(mdates.DayLocator(interval=7))
ax.set(title='ACOUSTICNESS')
# Single legend call, made after axvspan so that the shaded span is listed.
ax.legend(loc="upper right")
ax.set_xlim([datetime.date(2020, 1, 1), datetime.date(2020, 12, 31)])
ax.set_ylim(0.10, 0.3)
fig.autofmt_xdate()
fig.tight_layout()
Speechiness measures the presence of spoken words in a track. The plot shows no particular COVID-related trends, but a steady increase over the past four years, indicating the growing popularity of rap/hip-hop music, that dominates the top 200, as shown previously.
# Weekly speechiness, one line per column of d1.speechiness, drawn on a shared
# calendar axis with the Italian lockdown period shaded.
fig, ax = plt.subplots(1, 1, figsize=(15, 9))
# Muted colours for the first three lines, a saturated red for the fourth.
flatui = ["#b0b6ff", "#9fd4c6", "#d3abff", "#EF553B"]
sns.set_style('whitegrid')
# Line colours come from `palette`. No `color` keyword is passed: the value
# used before ('year') is not a matplotlib colour specification.
plot = sns.lineplot(ax=ax, data=d1.speechiness, palette=flatui)
# Thicken the fourth line drawn (presumably the most recent year - verify
# against the column order of d1.speechiness).
lines = ax.get_lines()
line = lines[3]
line.set(linewidth=5)
ax.axvspan(*mdates.datestr2num(['2020-03-09', '2020-05-18']), color='salmon', alpha=0.2, label="Italy lockdown")
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d'))
ax.tick_params(which="both", bottom=True)
ax.xaxis.set_minor_locator(mdates.DayLocator(interval=7))
ax.set(title='SPEECHINESS')
# Single legend call, made after axvspan so that the shaded span is listed.
ax.legend(loc="upper right")
ax.set_xlim([datetime.date(2020, 1, 1), datetime.date(2020, 12, 31)])
fig.autofmt_xdate()
fig.tight_layout()
This radar chart compares the features across different periods of time. It is an interactive plot that lets the user pick two periods and see how all the features differ between them.
import plotly.graph_objects as go
# Interactive radar chart: two overlaid traces, each with its own dropdown
# that selects the month whose mean feature values the trace shows.
features = ['danceability', 'energy', 'acousticness', 'valence', 'speechiness']
# Monthly mean of the five features over the tracks with Position <= 50. The
# columns are selected before averaging so that text columns never reach mean().
df_resampled = df.loc[df['Position'] <= 50, features].resample('M').mean()
df_relevant = df_resampled[features]


def _month_label(date):
    """Return the 'YEAR-MONTH' label used for trace names and dropdown entries."""
    return str(date.year) + '-' + str(date.month)


def _month_buttons(trace_index):
    """Return one dropdown button per month that redraws trace `trace_index`.

    Rows are looked up by label with .loc; selecting rows by passing a date
    string to DataFrame[...] is not supported by recent pandas versions.
    """
    return [
        {
            'method': 'restyle',
            'label': _month_label(date),
            'args': [
                # One-row 2-D array: the outer dimension is the list of
                # traces being restyled.
                {'r': df_relevant.loc[[date]].to_numpy(),
                 'type': 'scatterpolar',
                 'name': _month_label(date)},
                [trace_index],
            ],
        }
        for date in df_relevant.index
    ]


# create figure
fig = go.Figure()
# Both traces start on the first month; each one is then driven by its menu.
for _ in range(2):
    fig.add_trace(go.Scatterpolar(r=df_relevant.iloc[0],
                                  theta=df_relevant.columns,
                                  visible=True,
                                  name=_month_label(df_relevant.index[0])))
buttons1 = _month_buttons(0)
buttons2 = _month_buttons(1)
# Dropdown menus: the first one is placed at y=1.1, the second keeps the
# default vertical position.
updatemenu = [
    dict(buttons=buttons1, direction='down', showactive=True,
         x=0.1, xanchor='left', y=1.1),
    dict(yanchor='top', buttons=buttons2, direction='down', showactive=True,
         x=0.1, xanchor='left'),
]
# add dropdown menus to the figure
fig.update_layout(updatemenus=updatemenu)
fig.update_layout(
    annotations=[
        dict(text="Date 1:", showarrow=False,
             x=0, y=1.085, yref="paper", align="left",
             font=dict(color='blue')),
        dict(text="Date 2:", showarrow=False,
             x=0, y=0.98, yref="paper", align="left",
             font=dict(color='red')),
    ]
)
fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0.1, 0.75]
        )),
    showlegend=True
)
fig.update_traces(fill='toself')
fig.show()
Our analysis showed that, limited to the data available to us, COVID 19 could have had an impact on music streamings.
From the very first plot we noticed a decrease in the number of streams during the quarantine period in Italy.
import os
import pandas as pd
import matplotlib.dates as mdates
import matplotlib as plt
from matplotlib.dates import DateFormatter
from IPython.display import display, HTML
import datetime
# Build one row per weekly chart of the Italian folder with the mean number of
# streams of that chart, then overlay the four years on one calendar year.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')
years = [2017, 2018, 2019, 2020]
list_data = []
for year in years:
    number_of_weeks = 0
    # os.listdir() returns entries in arbitrary order; sorting makes the week
    # counter follow the file names (whose first 13 characters are sliced as
    # '<cc>_YYYY-MM-DD', so name order presumably is date order).
    for csv_file in sorted(os.listdir(file_folder)):
        short_name = csv_file[:13]
        if str(year) not in short_name:
            continue
        df_temp = pd.read_csv(os.path.join(file_folder, csv_file))
        avg_streams = df_temp["Streams"].mean()
        # The last two characters of the prefix are the day of the month.
        day = short_name[-2:]
        month = short_name[8:10]
        date = str(year) + '-' + month + '-' + day
        # Rows are collected in a list and converted once: appending to a
        # DataFrame inside a loop is slow.
        list_data.append([avg_streams, date, year, number_of_weeks])
        number_of_weeks = number_of_weeks + 1
df = pd.DataFrame(list_data, columns=["avg_streams", "date", "year", "number of weeks"])
df.date = pd.to_datetime(df.date)
df.set_index(df.date, inplace=True)
df.sort_index(inplace=True)
# Re-base every date on the year 2018 and keep the real year as a second index
# level, so that unstack() yields one column per (measure, year).
# NOTE(review): 2018 is not a leap year, so a chart dated 29 February would
# not survive this re-basing - confirm no such date exists in the data.
idx = pd.MultiIndex.from_arrays([
    pd.to_datetime(df.index.strftime('2018-%m-%d %H:%M:%S')),
    df.index.year
])
d1 = df.set_index(idx).unstack().resample('W').mean()
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib import dates
import seaborn as sns
import numpy as np
# Average weekly streams, one line per column of d1.avg_streams, drawn on a
# shared calendar axis with the Italian lockdown period shaded.
fig, ax = plt.subplots(1, 1, figsize=(15, 9))
# Muted colours for the first three lines, a saturated red for the fourth.
flatui = ["#b0b6ff", "#9fd4c6", "#d3abff", "#EF553B"]
sns.set_style('whitegrid')
# Line colours come from `palette`. No `color` keyword is passed: the value
# used before ('year') is not a matplotlib colour specification.
plot = sns.lineplot(ax=ax, data=d1.avg_streams, palette=flatui)
# Thicken the fourth line drawn (presumably the most recent year - verify
# against the column order of d1.avg_streams).
lines = ax.get_lines()
line = lines[3]
line.set(linewidth=5)
# The span is expressed in 2018 dates because every date in d1 was re-based
# on the year 2018.
ax.axvspan(*mdates.datestr2num(['2018-03-09', '2018-05-18']), color='salmon', alpha=0.2, label="Italy lockdown")
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d'))
ax.tick_params(which='major', labelsize=13)
ax.tick_params(which="both", bottom=True)
ax.xaxis.set_minor_locator(mdates.DayLocator(interval=7))
ax.set_title(label='Streams trend', size='15')
# Single legend call, made after axvspan so that the shaded span is listed.
ax.legend(loc="upper right")
ax.set_xlabel(xlabel='Date', size='15', style='italic')
ax.set_ylabel(ylabel='Streams', size='15', style='italic')
ax.set_xlim([datetime.date(2018, 1, 1), datetime.date(2018, 12, 31)])
fig.autofmt_xdate()
fig.tight_layout()
We also noticed a strong correlation between the number of streams and the speechiness feature. Thanks to our domain knowledge (being Italian and musicians), we supposed that Hip-Hop, Rap and Trap songs had a major impact on the Spotify streams trend.
In order to confirm this hypothesis, we decided to analyze the major peaks in the streams trend plot. Our analysis showed that we were right: major peaks in streams corresponded to musical events (Sanremo festival) and Rap, Hip-Hop and Trap releases.
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
from IPython.display import display, HTML
# Load every weekly chart of the Italian folder into a single frame indexed by
# the date found in the file name, then compute the weekly mean of the columns
# compared in the correlation plot.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')
years = [2017, 2018, 2019, 2020]
weekly_frames = []
for year in years:
    for csv_file in os.listdir(file_folder):
        # The date is read from character 15 of the file name onwards
        # (sliced below as 'YYYY-MM-DD').
        short_name = csv_file[15:]
        if str(year) not in short_name:
            continue
        df_temp = pd.read_csv(os.path.join(file_folder, csv_file))
        date = short_name[0:4] + '-' + short_name[5:7] + '-' + short_name[8:10]
        df_temp['date'] = pd.to_datetime(date)
        # set_index('date') moves the column into the index in one step.
        weekly_frames.append(df_temp.set_index('date'))
# Concatenate once: calling pd.concat inside the loop copies the accumulated
# frame for every file.
df = pd.concat(weekly_frames)
df.sort_index(inplace=True)
# Select the columns of interest before averaging. filter() ignores names that
# are absent, and text columns never reach mean().
relevant_columns = ['Streams', 'energy', 'acousticness', 'loudness', 'speechiness']
df_resampled = df.filter(items=relevant_columns).resample('W').mean()
df_relevant = df_resampled.filter(items=relevant_columns)
from scipy.stats import pearsonr
def corrfunc(x, y, ax=None, **kws):
    """Write the Pearson correlation of `x` and `y` in the top left corner of a plot.

    Pairs in which either value is NaN are left out before the coefficient is
    computed, so that a missing value does not turn the whole result into NaN
    (pandas' Series.corr, used by `corrdot`, also excludes missing values).

    Parameters:
        x, y: sequences of numbers of equal length.
        ax: axes to annotate; the current axes are used when it is None.
        **kws: accepted for compatibility with seaborn's grid mapping; unused.
    """
    import numpy as np  # local import: this cell does not import numpy itself

    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)
    paired = ~(np.isnan(x_values) | np.isnan(y_values))
    r, _ = pearsonr(x_values[paired], y_values[paired])
    if ax is None:
        ax = plt.gca()
    # Unicode for lowercase rho (ρ)
    rho = '\u03C1'
    ax.annotate(f'{rho} = {r:.2f}', xy=(.1, .9), xycoords=ax.transAxes)
def corrdot(*args, **kwargs):
    """Draw the Pearson correlation of the first two arguments as a sized dot.

    The current axes are switched off and a single dot is drawn at their
    centre; its area, its colour and the font size of the overlaid value all
    follow the correlation coefficient.

    Parameters:
        *args: the first two positional arguments are the series to correlate.
        **kwargs: accepted for compatibility with seaborn's grid mapping; unused.
    """
    first, second = args[0], args[1]
    corr_r = first.corr(second, 'pearson')
    strength = abs(corr_r)
    # Drop the leading zero: '0.42' is shown as '.42'.
    corr_text = "{:2.2f}".format(corr_r).replace("0.", ".")
    ax = plt.gca()
    ax.set_axis_off()
    ax.scatter(
        [.5], [.5], strength * 10000, [corr_r],
        alpha=0.6, cmap="Blues", vmin=-1, vmax=1, transform=ax.transAxes,
    )
    ax.annotate(
        corr_text, [.5, .5],
        xycoords="axes fraction", ha='center', va='center',
        fontsize=strength * 40 + 5,
    )
# Pair grid over df_relevant: scatter plots with a lowess fit and the
# correlation coefficient below the diagonal, distributions on the diagonal,
# correlation dots above it.
thin_black_line = {'color': 'Black', 'linewidth': 1}
g = sns.PairGrid(df_relevant, aspect=1.4, diag_sharey=False)
g.map_lower(corrfunc)
g.map_lower(sns.regplot, lowess=True, ci=False, line_kws=dict(thin_black_line))
g.map_diag(sns.distplot, kde_kws=dict(thin_black_line))
g.map_upper(corrdot)
plt.show()
import os
import pandas as pd
import matplotlib.dates as mdates
import matplotlib as plt
from matplotlib.dates import DateFormatter
from IPython.display import display, HTML
import datetime
# Build one row per weekly chart of the Italian folder with the mean number of
# streams of that chart, then overlay the four years on one calendar year.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')
years = [2017, 2018, 2019, 2020]
list_data = []
for year in years:
    number_of_weeks = 0
    # os.listdir() returns entries in arbitrary order; sorting makes the week
    # counter follow the file names (whose first 13 characters are sliced as
    # '<cc>_YYYY-MM-DD', so name order presumably is date order).
    for csv_file in sorted(os.listdir(file_folder)):
        short_name = csv_file[:13]
        if str(year) not in short_name:
            continue
        df_temp = pd.read_csv(os.path.join(file_folder, csv_file))
        avg_streams = df_temp["Streams"].mean()
        # The last two characters of the prefix are the day of the month.
        day = short_name[-2:]
        month = short_name[8:10]
        date = str(year) + '-' + month + '-' + day
        # Rows are collected in a list and converted once: appending to a
        # DataFrame inside a loop is slow.
        list_data.append([avg_streams, date, year, number_of_weeks])
        number_of_weeks = number_of_weeks + 1
df = pd.DataFrame(list_data, columns=["avg_streams", "date", "year", "number of weeks"])
df.date = pd.to_datetime(df.date)
df.set_index(df.date, inplace=True)
df.sort_index(inplace=True)
# Re-base every date on the year 2018 and keep the real year as a second index
# level, so that unstack() yields one column per (measure, year).
# NOTE(review): 2018 is not a leap year, so a chart dated 29 February would
# not survive this re-basing - confirm no such date exists in the data.
idx = pd.MultiIndex.from_arrays([
    pd.to_datetime(df.index.strftime('2018-%m-%d %H:%M:%S')),
    df.index.year
])
d1 = df.set_index(idx).unstack().resample('W').mean()
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np
import matplotlib.gridspec as gridspec
# Average weekly streams with one year highlighted: three lines are drawn in
# grey, the third one in purple with markers on three annotated weeks.
fig = plt.figure(figsize=(15, 9))
ax1 = fig.add_subplot()
sns.set_style('whitegrid')
# Grey for every line except the third one.
flatui = ["#DCDCDC", "#DCDCDC", "#AB63FA", "#DCDCDC"]
# Line colours come from `palette`. No `color` keyword is passed: the value
# used before ('years') is not a matplotlib colour specification.
plot = sns.lineplot(ax=ax1, data=d1.avg_streams, palette=flatui)
# Third line drawn (presumably 2019 - verify against the column order of
# d1.avg_streams).
lines = ax1.get_lines()
line = lines[2]
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%b-%d'))
ax1.tick_params(which="both", bottom=True)
ax1.xaxis.set_minor_locator(mdates.DayLocator(interval=7))
# Labels of the highlighted weeks; the x positions are 2018 dates because
# every date in d1 was re-based on the year 2018.
style = dict(size=15, color='grey', style='italic', weight=100)
ax1.text('2018-02-11', 720000, "Sanremo", ha='center', **style)
ax1.text('2018-06-17', 830000, "MACHETE \n Mixtape", ha='center', **style)
ax1.text('2018-10-21', 850000, "tha Supreme", ha='center', **style)
# Markers are drawn only on the data points at positions 45, 26 and 5 of the
# highlighted line.
props = {'marker': 'o', 'markersize': 15, 'markeredgewidth': 1.5,
         'markeredgecolor': 'black', 'markevery': [45, 26, 5], 'zorder': 5, 'linewidth': 5}
line.set(**props)
ax1.set_title(label='Streams trend', fontsize='15')
ax1.set_xlabel(xlabel='# of weeks', fontsize='15', style='italic')
ax1.set_ylabel(ylabel='Streams', fontsize='15', style='italic')
for side in ('right', 'top', 'left', 'bottom'):
    ax1.spines[side].set_visible(False)
ax1.legend(loc="lower right")
<matplotlib.legend.Legend at 0x2a44d7cd1f0>
As a consequence, we supposed that the decreasing number of streams could be caused by a smaller number of Rap, Hip-Hop and Trap releases. We filtered our dataset, selecting only the top 200 charts corresponding to the quarantine period (from March to May) for all years, and we plotted the number of new Hip-Hop/Rap releases.
We noticed that in 2020 only Nitro released a new album which didn't impact positively the streams trend: we suppose that this has been caused by the coincidence of the release date of the album with the lockdown announcement by the italian government, people's attention has been caught by the news.
As we already discussed in our report, new releases by Ghali and Marracash are actually fake data: the addition of late remixes to the album, refreshes the release date of all the songs contained inside it.
The most famous songs of the album (which usually remain for a while in the top 200) are considered again as new releases, even if they're not.
Thus our hypothesis is not so wrong, probably the negative trend of streamings has been caused by a lack of new releases.
# Selecting the lockdown period and the plotting the artists with a high number of releases in that period
import os
import pandas as pd
from IPython.display import display, HTML
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
# For every weekly chart, flag the tracks released during the week of the
# chart ('new' = 1), then count the flagged Hip-Hop/Rap tracks per artist and
# year inside a fixed window of weeks.
path = os.getcwd()
file_folder = os.path.join(path, 'it_features')
years = [2017, 2018, 2019, 2020]
date_format = "%Y-%m-%d"
columns = ["Artist", "Track Name", "date", "year", "number_of_weeks", "release date", "new", "genre"]


def _released_in_week(release_date, week_start, week_end):
    """Return 1 if `release_date` parses to a day in [week_start, week_end), else 0.

    Values that are not full 'YYYY-MM-DD' strings (for example a bare year, or
    a missing value read as NaN) count as not new.
    """
    try:
        released = datetime.datetime.strptime(release_date, date_format).date()
    except (ValueError, TypeError):
        return 0
    return int(week_start <= released < week_end)


weekly_frames = []
for year in years:
    number_of_weeks = 0
    # os.listdir() returns entries in arbitrary order. The week counter below
    # selects the window of weeks, so the files are sorted by name first (the
    # name starts with '<cc>_YYYY-MM-DD', parsed below, so name order
    # presumably is date order).
    for csv_file in sorted(os.listdir(file_folder)):
        short_name = csv_file[:13]
        if str(year) not in short_name:
            continue
        df_temp = pd.read_csv(os.path.join(file_folder, csv_file))
        df_temp = df_temp.filter(["Artist", "Track Name", "release date", "genre"])
        day = short_name[-2:]
        month = short_name[8:10]
        # First day of the chart week, and the first day after it.
        week_start = datetime.datetime.strptime(csv_file[3:13], date_format).date()
        week_end = week_start + datetime.timedelta(days=7)
        df_temp['new'] = [
            _released_in_week(release_date, week_start, week_end)
            for release_date in df_temp["release date"]
        ]
        df_temp['date'] = month + '-' + day
        df_temp['year'] = year
        df_temp['number_of_weeks'] = number_of_weeks
        weekly_frames.append(df_temp)
        number_of_weeks = number_of_weeks + 1
# Concatenate once; the numeric columns keep their integer dtype this way.
if weekly_frames:
    df = pd.concat(weekly_frames).reindex(columns=columns)
else:
    df = pd.DataFrame(columns=columns)
lower_bound = 9
upper_bound = 19
df = df[(df['number_of_weeks'] >= lower_bound) & (df['number_of_weeks'] <= upper_bound)]
df = df[df['genre'] == 'Hip-Hop/Rap']
# Only 'new' is summed: the other columns are text or bookkeeping values.
new_df = df.groupby(['Artist', "year"], sort=True)['new'].sum().reset_index()
new_df = new_df.sort_values(by=['new'], ascending=[False])
new_df = new_df.filter(items=['Artist', "year", "new"])
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np
import matplotlib.gridspec as gridspec
import matplotlib
# Maybe some of the labels can be dropped.
# One bar chart per year (2x2 grid): the first ten rows of each year's slice
# of `new_df`, plotting the `new` column per artist.
year_2017 = new_df.loc[new_df['year'] == 2017]
year_2018 = new_df.loc[new_df['year'] == 2018]
year_2019 = new_df.loc[new_df['year'] == 2019]
year_2020 = new_df.loc[new_df['year'] == 2020]

fig = plt.figure(figsize=(15, 12))
gs0 = gridspec.GridSpec(2, 2, figure=fig, hspace=0.7)
# NOTE(review): the original indentation was lost; only the axes creation is
# assumed to sit inside the style context - confirm against the notebook.
with sns.axes_style("whitegrid"):
    ax1 = fig.add_subplot(gs0[0, 0])
    ax2 = fig.add_subplot(gs0[0, 1])
    ax3 = fig.add_subplot(gs0[1, 0])
    ax4 = fig.add_subplot(gs0[1, 1])


def _draw_top_artists(ax, year_df, cmap_name, year):
    """Draw the first ten rows of `year_df` as bars on `ax` and return the bar colors.

    Bars are colored by mapping the `new` values through the colormap
    `cmap_name`; `year` is only used to build the panel title.
    """
    label_style = dict(fontsize=15, style='italic', color='#4f4e4e')
    top_rows = year_df.head(10)
    colors = matplotlib.cm.ScalarMappable(cmap=cmap_name).to_rgba(top_rows['new'])
    sns.barplot(x="Artist", y="new", data=top_rows, ax=ax, palette=colors,
                saturation=1, edgecolor='grey')
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right',
        fontweight='light',
        fontsize='x-large'
    )
    ax.yaxis.set_major_locator(ticker.MultipleLocator(2))
    ax.set_xlabel('Artist', **label_style)
    ax.set_ylabel('Number of new releases', **label_style)
    ax.set_title('03-06-{0} to 05-15-{0}'.format(year),
                 fontsize=15, fontweight='bold', color='#4f4e4e')
    for side in ('right', 'top', 'left'):
        ax.spines[side].set_visible(False)
    # Same y-range on every panel so the years can be compared directly.
    ax.set(ylim=(0, 20))
    return colors


panels = [
    (ax1, year_2017, 'Blues', 2017),
    (ax2, year_2018, 'Purples', 2018),
    (ax3, year_2019, 'Greens', 2019),
    (ax4, year_2020, 'Reds', 2020),
]
for panel_ax, panel_df, panel_cmap, panel_year in panels:
    palette = _draw_top_artists(panel_ax, panel_df, panel_cmap, panel_year)
[(0.0, 20.0)]
We also noticed no major changes in audio features: their distributions remained the same. This is to be expected: mainstream music is dominated by Pop, Hip-Hop and Rap songs, so analyzing only the top 200 will not reveal significant changes in the feature distributions.
However, our conclusions are limited to the top 200 and do not hold in general.